2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this view of the source is elided — some guard lines,
    # try: openers, decorators and returns are missing from the bodies
    # below.  Compare against the upstream file before changing code.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): a line appears elided here (presumably an
        # initialization of a "ready" flag) — confirm upstream.
        self.set_downloader(downloader)

    # NOTE(review): a @classmethod decorator is presumably elided here —
    # the first parameter is named cls.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    # NOTE(review): the enclosing def (a _WORKING accessor) is elided.
        """Getter method for _WORKING."""

    # NOTE(review): the enclosing def (initialize) is elided.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the enclosing def (an IE_NAME property) is elided;
    # the expression strips the trailing "IE" from the class name.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): a note-is-None guard appears elided before this line.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # NOTE(review): the opening try: of this handler appears elided.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): an errnote-is-None guard appears elided here.
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' keeps decoding going on malformed UTF-8 instead of raising.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the source is elided — assignment headers,
    # guard lines, try: openers, else: branches and some returns are missing
    # from the bodies below.  Compare against upstream before changing code.

    # NOTE(review): the lines below are the interior of the verbose
    # _VALID_URL regex; its r'''...''' wrapper is not visible in this view.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; most entries elided in this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> resolution string; entries elided in this view.
    _video_dimensions = {

    # NOTE(review): a @classmethod decorator is presumably elided here.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Defer playlist URLs to the playlist extractor.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt subtitle text.

        NOTE(review): the srt accumulator initialization, the float(start)
        conversion and the final return appear elided in this view.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration get a 4-second default.
            if not dur: dur = '4'
            end = start + float(dur)
            # Format as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch subtitles for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): try: opener elided before this line.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the language: user-requested, else English, else first listed.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): this branch's body and the else: header are elided.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # NOTE(review): some dict entries of this request appear elided.
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): try: opener elided before this line.
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): an emptiness guard appears elided before this return.
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the for-loop header over formats appears elided.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age.

        NOTE(review): heavily elided in this view — several guards, try:
        openers, form entries and returns are missing below.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): try: opener and the success branch are elided.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the English page so later regex-based scraping is stable.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): try: opener elided before this line.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): try: opener elided before this line.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-CSRF tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the opening of the login form dict and several of
        # its entries are elided here.
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): try: opener elided before this line.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # NOTE(review): the age_form dict opening is elided here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): try: opener elided before this line.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the 11-character video id from url (group 2 of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the mobj-is-None guard and the return are elided.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        """Download webpage + get_video_info, pick formats, and return
        one info dict per selected format.

        NOTE(review): heavily elided in this view — guard lines, try:
        openers, else: branches and returns are missing below.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): try: opener elided before this line.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the backslash-escaped URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' values; stop at the first response with a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): try: opener elided before this line.
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scrape from the page and try several textual formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): try/except around strptime appears elided.
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # Closed captions (only when requested by the user).
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected (format, url) pair.
        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the results.append({ opener and the 'id' entry
            # appear elided here.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view of the source is elided — guard lines, try:
    # openers, else: branches and returns are missing from bodies below.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        disable filtering for this session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): try: opener elided before this line.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # NOTE(review): the disclaimer_form dict opening is elided here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): try: opener elided before this line.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        NOTE(review): elided in this view — several None-checks, else:
        branches and the final return appear to be missing below.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids straight to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fall back to the flashvars-encoded media data.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the return [{ opener appears elided here.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view of the source is elided — guard lines, else:
    # branches and the final return are missing from bodies below.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available quality URL, title, uploader and
        upload date for a Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip any title suffix and query string from the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Cookie disables the family filter so restricted videos resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the return [{ opener and 'id'/'url' entries appear
        # elided here.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view of the source is elided — guard lines, try:
    # openers and the final return are missing from bodies below.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader for a Photobucket
        video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the return [{ opener appears elided here.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this view of the source is elided — guard lines, try:
    # openers and the final return are missing from bodies below.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract a Yahoo! video.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted once (new_video=False prevents infinite recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): try: opener elided before this line.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the return [{ opener and the 'url' entry appear
        # elided here.
            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
1017 class VimeoIE(InfoExtractor):
1018 """Information extractor for vimeo.com."""
1020 # _VALID_URL matches Vimeo URLs
1021 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1024 def __init__(self, downloader=None):
1025 InfoExtractor.__init__(self, downloader)
1027 def report_download_webpage(self, video_id):
1028 """Report webpage download."""
1029 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1031 def report_extraction(self, video_id):
1032 """Report information extraction."""
1033 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1035 def _real_extract(self, url, new_video=True):
1036 # Extract ID from URL
1037 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header (and its early return) is elided in
# this chunk; 1039 below is its error branch.
1039 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1042 video_id = mobj.group('id')
# Normalize the URL: force https, and map direct play_redirect_hls links
# back to the canonical video page.
1043 if not mobj.group('proto'):
1044 url = 'https://' + url
1045 if mobj.group('direct_link'):
1046 url = 'https://vimeo.com/' + video_id
1048 # Retrieve video webpage to extract further information
1049 request = compat_urllib_request.Request(url, None, std_headers)
# NOTE(review): a try: header is elided here; 1054 is the matching except.
1051 self.report_download_webpage(video_id)
1052 webpage_bytes = compat_urllib_request.urlopen(request).read()
1053 webpage = webpage_bytes.decode('utf-8')
1054 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1055 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1058 # Now we begin extracting as much information as we can from what we
1059 # retrieved. First we extract the information common to all extractors,
1060 # and latter we extract those that are Vimeo specific.
1061 self.report_extraction(video_id)
1063 # Extract the config JSON
# The config JSON is embedded in the page between ' = {config:' and ',assets:'.
1065 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1066 config = json.loads(config)
# NOTE(review): the except header for the parse-failure path is elided; 1068 is its body.
1068 self._downloader.trouble(u'ERROR: unable to extract info section')
1072 video_title = config["video"]["title"]
1074 # Extract uploader and uploader_id
1075 video_uploader = config["video"]["owner"]["name"]
1076 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1078 # Extract video thumbnail
1079 video_thumbnail = config["video"]["thumbnail"]
1081 # Extract video description
1082 video_description = get_element_by_attribute("itemprop", "description", webpage)
1083 if video_description: video_description = clean_html(video_description)
1084 else: video_description = ''
1086 # Extract upload date
1087 video_upload_date = None
1088 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1089 if mobj is not None:
# Collapse the ISO date parts into YYYYMMDD (the info-dict contract).
1090 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1092 # Vimeo specific: extract request signature and timestamp
1093 sig = config['request']['signature']
1094 timestamp = config['request']['timestamp']
1096 # Vimeo specific: extract video codec and quality information
1097 # First consider quality, then codecs, then take everything
1098 # TODO bind to format param
1099 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1100 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by its best advertised quality.
1101 for codec_name, codec_extension in codecs:
1102 if codec_name in config["video"]["files"]:
1103 if 'hd' in config["video"]["files"][codec_name]:
1104 files['hd'].append((codec_name, codec_extension, 'hd'))
1105 elif 'sd' in config["video"]["files"][codec_name]:
1106 files['sd'].append((codec_name, codec_extension, 'sd'))
1108 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best non-empty bucket, preferring hd, then sd, then other.
1110 for quality in ('hd', 'sd', 'other'):
1111 if len(files[quality]) > 0:
1112 video_quality = files[quality][0][2]
1113 video_codec = files[quality][0][0]
1114 video_extension = files[quality][0][1]
1115 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# NOTE(review): the else-branch header (no codec found) is elided; 1118 is its body.
1118 self._downloader.trouble(u'ERROR: no known codec found')
1121 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1122 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the 'return [{' opener of the result list is elided before these entries.
1127 'uploader': video_uploader,
1128 'uploader_id': video_uploader_id,
1129 'upload_date': video_upload_date,
1130 'title': video_title,
1131 'ext': video_extension,
1132 'thumbnail': video_thumbnail,
1133 'description': video_description,
1137 class ArteTvIE(InfoExtractor):
1138 """arte.tv information extractor."""
1140 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live pages are recognized by an index-N.html suffix on the last path segment.
1141 _LIVE_URL = r'index-[0-9]+\.html$'
1143 IE_NAME = u'arte.tv'
1145 def __init__(self, downloader=None):
1146 InfoExtractor.__init__(self, downloader)
1148 def report_download_webpage(self, video_id):
1149 """Report webpage download."""
1150 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1152 def report_extraction(self, video_id):
1153 """Report information extraction."""
1154 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1156 def fetch_webpage(self, url):
# Download url and return the raw page body; failures are routed through
# the downloader's trouble() channel.
1157 request = compat_urllib_request.Request(url)
# NOTE(review): a try: header is elided here; 1161/1164 are the matching excepts.
1159 self.report_download_webpage(url)
1160 webpage = compat_urllib_request.urlopen(request).read()
1161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1162 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1164 except ValueError as err:
1165 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1169 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, apply regex with regexFlags, and collect the selected match
# groups into a dict. matchTuples is a list of
# (group_index, key, error_message) triples.
1170 page = self.fetch_webpage(url)
1171 mobj = re.search(regex, page, regexFlags)
# NOTE(review): the no-match guard header is elided; 1175 is its error branch.
1175 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1178 for (i, key, err) in matchTuples:
1179 if mobj.group(i) is None:
1180 self._downloader.trouble(err)
1183 info[key] = mobj.group(i)
1187 def extractLiveStream(self, url):
# Live streams: the language code sits four path segments from the end.
1188 video_lang = url.split('/')[-4]
# First hop: locate the videothek JS file referenced by the page.
1189 info = self.grep_webpage(
1191 r'src="(.*?/videothek_js.*?\.js)',
1194 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1197 http_host = url.split('/')[2]
1198 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
# Second hop: pull stream path, SWF player and base url out of the JS.
1199 info = self.grep_webpage(
1201 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1202 '(http://.*?\.swf).*?' +
1206 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1207 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1208 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1211 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1213 def extractPlus7Stream(self, url):
# "+7" catch-up streams: follow two levels of XML indirection to the
# final metadata document.
1214 video_lang = url.split('/')[-3]
1215 info = self.grep_webpage(
1217 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1220 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1223 next_url = compat_urllib_parse.unquote(info.get('url'))
# Pick the <video> reference matching the requested language.
1224 info = self.grep_webpage(
1226 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1229 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1232 next_url = compat_urllib_parse.unquote(info.get('url'))
# Final document: id, title, date and the hd-quality media url.
1234 info = self.grep_webpage(
1236 r'<video id="(.*?)".*?>.*?' +
1237 '<name>(.*?)</name>.*?' +
1238 '<dateVideo>(.*?)</dateVideo>.*?' +
1239 '<url quality="hd">(.*?)</url>',
1242 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1243 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1244 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1245 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): the return/dict opener for this info dict is elided above.
1250 'id': info.get('id'),
1251 'url': compat_urllib_parse.unquote(info.get('url')),
1252 'uploader': u'arte.tv',
1253 'upload_date': info.get('date'),
1254 'title': info.get('title').decode('utf-8'),
1260 def _real_extract(self, url):
1261 video_id = url.split('/')[-1]
1262 self.report_extraction(video_id)
# Dispatch on URL shape: live stream pages vs. regular "+7" video pages.
1264 if re.search(self._LIVE_URL, video_id) is not None:
1265 self.extractLiveStream(url)
1268 info = self.extractPlus7Stream(url)
1273 class GenericIE(InfoExtractor):
1274 """Generic last-resort information extractor."""
1277 IE_NAME = u'generic'
1279 def __init__(self, downloader=None):
1280 InfoExtractor.__init__(self, downloader)
1282 def report_download_webpage(self, video_id):
1283 """Report webpage download."""
# The fallback warning is suppressed in test mode.
1284 if not self._downloader.params.get('test', False):
1285 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1286 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1288 def report_extraction(self, video_id):
1289 """Report information extraction."""
1290 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1292 def report_following_redirect(self, new_url):
1293 """Report information extraction."""
1294 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1296 def _test_redirect(self, url):
1297 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HEAD instead of GET.
1298 class HeadRequest(compat_urllib_request.Request):
1299 def get_method(self):
# NOTE(review): the method body (presumably returning "HEAD") is elided here.
1302 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1304 Subclass the HTTPRedirectHandler to make it use our
1305 HeadRequest also on the redirected URL
1307 def redirect_request(self, req, fp, code, msg, headers, newurl):
1308 if code in (301, 302, 303, 307):
1309 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: a HEAD request carries no payload.
1310 newheaders = dict((k,v) for k,v in req.headers.items()
1311 if k.lower() not in ("content-length", "content-type"))
1312 return HeadRequest(newurl,
1314 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes are re-raised as HTTP errors.
1317 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1319 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1321 Fallback to GET if HEAD is not allowed (405 HTTP error)
1323 def http_error_405(self, req, fp, code, msg, headers):
1327 newheaders = dict((k,v) for k,v in req.headers.items()
1328 if k.lower() not in ("content-length", "content-type"))
1329 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1331 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the HEAD probe.
1335 opener = compat_urllib_request.OpenerDirector()
1336 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1337 HTTPMethodFallback, HEADRedirectHandler,
1338 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1339 opener.add_handler(handler())
1341 response = opener.open(HeadRequest(url))
1342 new_url = response.geturl()
# NOTE(review): the comparison of new_url against url (the "no redirect"
# early-return path) is elided before this point.
1347 self.report_following_redirect(new_url)
1348 self._downloader.download([new_url])
1351 def _real_extract(self, url):
1352 if self._test_redirect(url): return
1354 video_id = url.split('/')[-1]
# NOTE(review): a try: header is elided here; 1357 is the matching except.
1356 webpage = self._download_webpage(url, video_id)
1357 except ValueError as err:
1358 # since this is the last-resort InfoExtractor, if
1359 # this error is thrown, it'll be thrown here
1360 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1363 self.report_extraction(video_id)
1364 # Start with something easy: JW Player in SWFObject
1365 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# NOTE(review): "if mobj is None:" guards between these progressively
# broader searches are elided in this chunk.
1367 # Broaden the search a little bit
1368 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1370 # Broaden the search a little bit: JWPlayer JS loader
1371 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1373 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1376 # It's possible that one of the regexes
1377 # matched, but returned an empty group:
1378 if mobj.group(1) is None:
1379 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1382 video_url = compat_urllib_parse.unquote(mobj.group(1))
1383 video_id = os.path.basename(video_url)
1385 # here's a fun little line of code for you:
1386 video_extension = os.path.splitext(video_id)[1][1:]
1387 video_id = os.path.splitext(video_id)[0]
1389 # it's tempting to parse this further, but you would
1390 # have to take into account all the variations like
1391 # Video Title - Site Name
1392 # Site Name | Video Title
1393 # Video Title - Tagline | Site Name
1394 # and so on and so forth; it's just not practical
1395 mobj = re.search(r'<title>(.*)</title>', webpage)
1397 self._downloader.trouble(u'ERROR: unable to extract title')
1399 video_title = mobj.group(1)
1401 # video uploader is domain name
1402 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404 self._downloader.trouble(u'ERROR: unable to extract title')
1406 video_uploader = mobj.group(1)
# NOTE(review): the 'return [{' opener of the result list is elided before these entries.
1411 'uploader': video_uploader,
1412 'upload_date': None,
1413 'title': video_title,
1414 'ext': video_extension,
1418 class YoutubeSearchIE(InfoExtractor):
1419 """Information Extractor for YouTube search queries."""
1420 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1421 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1422 _max_youtube_results = 1000
1423 IE_NAME = u'youtube:search'
1425 def __init__(self, downloader=None):
1426 InfoExtractor.__init__(self, downloader)
1428 def report_download_page(self, query, pagenum):
1429 """Report attempt to download search page with given number."""
1430 query = query.decode(preferredencoding())
1431 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1433 def _real_extract(self, query):
1434 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the no-match guard header is elided; 1436 is its error branch.
1436 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1439 prefix, query = query.split(':')
1441 query = query.encode('utf-8')
# Prefix selects result count: '' -> 1 result, 'all' -> the maximum,
# otherwise the prefix is parsed as an integer below.
1443 self._download_n_results(query, 1)
1445 elif prefix == 'all':
1446 self._download_n_results(query, self._max_youtube_results)
1452 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1454 elif n > self._max_youtube_results:
1455 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1456 n = self._max_youtube_results
1457 self._download_n_results(query, n)
1459 except ValueError: # parsing prefix as integer fails
1460 self._download_n_results(query, 1)
1463 def _download_n_results(self, query, n):
1464 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until the limit is reached.
1470 while (50 * pagenum) < limit:
1471 self.report_download_page(query, pagenum+1)
1472 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1473 request = compat_urllib_request.Request(result_url)
# NOTE(review): a try: header is elided here; 1476 is the matching except.
1475 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1476 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1477 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1479 api_response = json.loads(data)['data']
1481 if not 'items' in api_response:
1482 self._downloader.trouble(u'[youtube] No video results')
1485 new_ids = list(video['id'] for video in api_response['items'])
1486 video_ids += new_ids
# Never request more results than the API reports as available.
1488 limit = min(n, api_response['totalItems'])
1491 if len(video_ids) > n:
1492 video_ids = video_ids[:n]
1493 for id in video_ids:
1494 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1498 class GoogleSearchIE(InfoExtractor):
1499 """Information Extractor for Google Video search queries."""
1500 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1501 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1502 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1503 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1504 _max_google_results = 1000
1505 IE_NAME = u'video.google:search'
1507 def __init__(self, downloader=None):
1508 InfoExtractor.__init__(self, downloader)
1510 def report_download_page(self, query, pagenum):
1511 """Report attempt to download playlist page with given number."""
1512 query = query.decode(preferredencoding())
1513 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1515 def _real_extract(self, query):
1516 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the no-match guard header is elided; 1518 is its error branch.
1518 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1521 prefix, query = query.split(':')
1523 query = query.encode('utf-8')
# Prefix selects result count: '' -> 1, 'all' -> maximum, else parsed as int.
1525 self._download_n_results(query, 1)
1527 elif prefix == 'all':
1528 self._download_n_results(query, self._max_google_results)
1534 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1536 elif n > self._max_google_results:
1537 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1538 n = self._max_google_results
1539 self._download_n_results(query, n)
1541 except ValueError: # parsing prefix as integer fails
1542 self._download_n_results(query, 1)
1545 def _download_n_results(self, query, n):
1546 """Downloads a specified number of results for a query"""
1552 self.report_download_page(query, pagenum)
1553 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1554 request = compat_urllib_request.Request(result_url)
# NOTE(review): a try: header is elided here; 1557 is the matching except.
1556 page = compat_urllib_request.urlopen(request).read()
1557 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1558 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1561 # Extract video identifiers
1562 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1563 video_id = mobj.group(1)
1564 if video_id not in video_ids:
1565 video_ids.append(video_id)
1566 if len(video_ids) == n:
1567 # Specified n videos reached
1568 for id in video_ids:
1569 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# Stop paging when the results page has no "next" link.
1572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1573 for id in video_ids:
1574 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1577 pagenum = pagenum + 1
1580 class YahooSearchIE(InfoExtractor):
1581 """Information Extractor for Yahoo! Video search queries."""
1584 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1585 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1586 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1587 _MORE_PAGES_INDICATOR = r'\s*Next'
1588 _max_yahoo_results = 1000
1589 IE_NAME = u'video.yahoo:search'
1591 def __init__(self, downloader=None):
1592 InfoExtractor.__init__(self, downloader)
1594 def report_download_page(self, query, pagenum):
1595 """Report attempt to download playlist page with given number."""
1596 query = query.decode(preferredencoding())
1597 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1599 def _real_extract(self, query):
1600 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the no-match guard header is elided; 1602 is its error branch.
1602 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1605 prefix, query = query.split(':')
1607 query = query.encode('utf-8')
# Prefix selects result count: '' -> 1, 'all' -> maximum, else parsed as int.
1609 self._download_n_results(query, 1)
1611 elif prefix == 'all':
1612 self._download_n_results(query, self._max_yahoo_results)
1618 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1620 elif n > self._max_yahoo_results:
1621 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1622 n = self._max_yahoo_results
1623 self._download_n_results(query, n)
1625 except ValueError: # parsing prefix as integer fails
1626 self._download_n_results(query, 1)
1629 def _download_n_results(self, query, n):
1630 """Downloads a specified number of results for a query"""
# already_seen guards against duplicate ids across result pages.
1633 already_seen = set()
1637 self.report_download_page(query, pagenum)
1638 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1639 request = compat_urllib_request.Request(result_url)
# NOTE(review): a try: header is elided here; 1642 is the matching except.
1641 page = compat_urllib_request.urlopen(request).read()
1642 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1643 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1646 # Extract video identifiers
1647 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1648 video_id = mobj.group(1)
1649 if video_id not in already_seen:
1650 video_ids.append(video_id)
1651 already_seen.add(video_id)
1652 if len(video_ids) == n:
1653 # Specified n videos reached
1654 for id in video_ids:
1655 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# Stop paging when the results page has no "Next" link.
1658 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1659 for id in video_ids:
1660 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1663 pagenum = pagenum + 1
1666 class YoutubePlaylistIE(InfoExtractor):
1667 """Information Extractor for YouTube playlists."""
# _VALID_URL is a verbose-mode pattern; see suitable() below for the
# re.VERBOSE flag. NOTE(review): several pattern lines are elided in this chunk.
1669 _VALID_URL = r"""(?:
1674 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1675 \? (?:.*?&)*? (?:p|a|list)=
1680 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1683 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1685 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1687 IE_NAME = u'youtube:playlist'
1689 def __init__(self, downloader=None):
1690 InfoExtractor.__init__(self, downloader)
# Override: the base suitable() does not pass re.VERBOSE, which this
# class's _VALID_URL requires.
1693 def suitable(cls, url):
1694 """Receives a URL and returns True if suitable for this IE."""
1695 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1697 def report_download_page(self, playlist_id, pagenum):
1698 """Report attempt to download playlist page with given number."""
1699 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1701 def _real_extract(self, url):
1702 # Extract playlist id
1703 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the no-match guard header is elided; 1705 is its error branch.
1705 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1708 # Download playlist videos from API
1709 playlist_id = mobj.group(1) or mobj.group(2)
1714 self.report_download_page(playlist_id, page_num)
1716 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
# NOTE(review): a try: header is elided here; 1719 is the matching except.
1718 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1720 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# NOTE(review): another try: header is elided here; 1725 is the matching except.
1724 response = json.loads(page)
1725 except ValueError as err:
1726 self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
1729 if not 'feed' in response or not 'entry' in response['feed']:
1730 self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
# Collect (position, url) pairs; entries without 'content' are skipped.
1732 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1733 for entry in response['feed']['entry']
1734 if 'content' in entry ]
# A short page means the end of the playlist has been reached.
1736 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Order by playlist position, then keep only the urls.
1740 videos = [v[1] for v in sorted(videos)]
# Apply the user's --playlist-start/--playlist-end window (1-based).
1743 playliststart = self._downloader.params.get('playliststart', 1) - 1
1744 playlistend = self._downloader.params.get('playlistend', -1)
1745 if playlistend == -1:
1746 videos = videos[playliststart:]
1748 videos = videos[playliststart:playlistend]
1750 if len(videos) == total:
1751 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1753 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1755 for video in videos:
1756 self._downloader.download([video])
1760 class YoutubeChannelIE(InfoExtractor):
1761 """Information Extractor for YouTube channels."""
1763 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1764 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# The literal "Next »" marker on a channel page signals more pages exist.
1765 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1766 IE_NAME = u'youtube:channel'
1768 def report_download_page(self, channel_id, pagenum):
1769 """Report attempt to download channel page with given number."""
1770 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1772 def _real_extract(self, url):
1773 # Extract channel id
1774 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header is elided; 1776 is its error branch.
1776 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1779 # Download channel pages
1780 channel_id = mobj.group(1)
1785 self.report_download_page(channel_id, pagenum)
1786 url = self._TEMPLATE_URL % (channel_id, pagenum)
1787 request = compat_urllib_request.Request(url)
# NOTE(review): a try: header is elided here; 1790 is the matching except.
1789 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1790 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1791 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1794 # Extract video identifiers
1796 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1797 if mobj.group(1) not in ids_in_page:
1798 ids_in_page.append(mobj.group(1))
1799 video_ids.extend(ids_in_page)
# Stop paging once the "Next »" marker disappears from the page.
1801 if self._MORE_PAGES_INDICATOR not in page:
1803 pagenum = pagenum + 1
1805 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1807 for id in video_ids:
1808 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1812 class YoutubeUserIE(InfoExtractor):
1813 """Information Extractor for YouTube users."""
1815 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1816 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The GData uploads feed returns at most 50 entries per request.
1817 _GDATA_PAGE_SIZE = 50
1818 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1819 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1820 IE_NAME = u'youtube:user'
1822 def __init__(self, downloader=None):
1823 InfoExtractor.__init__(self, downloader)
1825 def report_download_page(self, username, start_index):
1826 """Report attempt to download user page."""
1827 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1828 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1830 def _real_extract(self, url):
1832 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header is elided; 1834 is its error branch.
1834 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1837 username = mobj.group(1)
1839 # Download video ids using YouTube Data API. Result size per
1840 # query is limited (currently to 50 videos) so we need to query
1841 # page by page until there are no video ids - it means we got
# The GData feed is 1-indexed, hence the +1.
1848 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1849 self.report_download_page(username, start_index)
1851 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# NOTE(review): a try: header is elided here; 1855 is the matching except.
1854 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1855 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1856 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1859 # Extract video identifiers
1862 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1863 if mobj.group(1) not in ids_in_page:
1864 ids_in_page.append(mobj.group(1))
1866 video_ids.extend(ids_in_page)
1868 # A little optimization - if current page is not
1869 # "full", ie. does not contain PAGE_SIZE video ids then
1870 # we can assume that this page is the last one - there
1871 # are no more ids on further pages - no need to query
1874 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based).
1879 all_ids_count = len(video_ids)
1880 playliststart = self._downloader.params.get('playliststart', 1) - 1
1881 playlistend = self._downloader.params.get('playlistend', -1)
1883 if playlistend == -1:
1884 video_ids = video_ids[playliststart:]
1886 video_ids = video_ids[playliststart:playlistend]
1888 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1889 (username, all_ids_count, len(video_ids)))
1891 for video_id in video_ids:
1892 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1895 class BlipTVUserIE(InfoExtractor):
1896 """Information Extractor for blip.tv users."""
1898 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1900 IE_NAME = u'blip.tv:user'
1902 def __init__(self, downloader=None):
1903 InfoExtractor.__init__(self, downloader)
1905 def report_download_page(self, username, pagenum):
1906 """Report attempt to download user page."""
1907 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1908 (self.IE_NAME, username, pagenum))
1910 def _real_extract(self, url):
1912 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header is elided; 1914 is its error branch.
1914 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1917 username = mobj.group(1)
1919 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1921 request = compat_urllib_request.Request(url)
# NOTE(review): a try: header is elided here; 1927 is the matching except.
1924 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# Resolve the user's numeric id, required by the episode-list endpoint.
1925 mobj = re.search(r'data-users-id="([^"]+)"', page)
1926 page_base = page_base % mobj.group(1)
1927 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1928 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1932 # Download video ids using BlipTV Ajax calls. Result size per
1933 # query is limited (currently to 12 videos) so we need to query
1934 # page by page until there are no video ids - it means we got
1941 self.report_download_page(username, pagenum)
1942 url = page_base + "&page=" + str(pagenum)
1943 request = compat_urllib_request.Request( url )
# NOTE(review): a try: header is elided here; 1946 is the matching except.
1945 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1946 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1947 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1950 # Extract video identifiers
1953 for mobj in re.finditer(r'href="/([^"]+)"', page):
1954 if mobj.group(1) not in ids_in_page:
1955 ids_in_page.append(unescapeHTML(mobj.group(1)))
1957 video_ids.extend(ids_in_page)
1959 # A little optimization - if current page is not
1960 # "full", ie. does not contain PAGE_SIZE video ids then
1961 # we can assume that this page is the last one - there
1962 # are no more ids on further pages - no need to query
1965 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based).
1970 all_ids_count = len(video_ids)
1971 playliststart = self._downloader.params.get('playliststart', 1) - 1
1972 playlistend = self._downloader.params.get('playlistend', -1)
1974 if playlistend == -1:
1975 video_ids = video_ids[playliststart:]
1977 video_ids = video_ids[playliststart:playlistend]
1979 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1980 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1982 for video_id in video_ids:
1983 self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # '(?#locale)' is a regex comment; '../' matches a two-char locale path.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles link to the real file URL, title and extension."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own error message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the .decode() calls assume Python 2 byte strings.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is used.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Optionally log in before extraction.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' entry in ~/.netrc.  Login failures only warn;
        extraction proceeds unauthenticated.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video URL, title and duration from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON is wedged between two fixed script snippets;
        # match the payload between the escaped anchors.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, falling back to SD.
        video_url = params['hd_src']
        video_url = params['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video metadata from blip.tv's JSON API (or detect a
        direct video download)."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose URL fragment carries the
        # real file id; rewrite to the canonical /a/a-<id> URL and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON description of the video (cchar is '?' or '&').
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): .decode() assumes a Python 2 byte string here.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some API responses wrap the payload in a 'Post' key.
            if 'Post' in json_data:
                data = json_data['Post']

            # Example datestamp: '11-07-12 04:25PM' -> YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Must match the header used for the JSON request above.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        # No extra state; defers entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2258 def _real_extract(self,url):
2259 mobj = re.match(self._VALID_URL, url)
2261 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2264 video_id = mobj.group(1)
2267 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2268 webpage = self._download_webpage(webpage_url, video_id)
2270 self.report_extraction(video_id)
2271 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2274 self._downloader.trouble(u'ERROR: unable to extract media URL')
2276 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2278 mobj = re.search('<title>([^<]+)</title>', webpage)
2280 self._downloader.trouble(u'ERROR: unable to extract title')
2283 video_title = mobj.group(1)
2289 'upload_date': None,
2290 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                    """

    # Known bitrates, lowest to highest; the last entry is preferred below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL above is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortnames like :tds redirect to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')

        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The landing page may redirect to the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)

        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # One <item> per video part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for this part.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the rtmp URL to a direct HTTP mp4 mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Follow the page's og:video player to its JSON(ish) config and
        pull the real stream URL plus metadata."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honor the charset from the Content-Type header, default utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries a percent-encoded 'config=' parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Index 1 is the actual episode; index 0 is presumably an intro.
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor video via its metadata XML and f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required by Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # f4m manifest elements live in the Adobe namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the direct segment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the watch page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The percent-encoded media URL is a flashvars parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) is the whole matched thumbnail URL.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # No extra state; defers entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track page to its mp3 stream via the SoundCloud API."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps a page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # 128 kbps mp3 stream is the one this extractor downloads.
        mediaURL = streams['http_mp3_128_url']

        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 video reference on the page into an rtmpe URL."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64- and percent-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        # No extra state; defers entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print each format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        """Pick a working stream URL for the requested (or best) format.

        NOTE(review): the .decode('utf-8') calls on str values below are
        Python 2 only; consistent with _WORKING = False.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # No explicit format: take the first format with a reachable URL.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:

        if req_format not in formats:
            self._downloader.trouble(u'ERROR: format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2952 def _real_extract(self, url):
2953 mobj = re.match(self._VALID_URL, url)
2955 raise ExtractorError(u'Invalid URL: %s' % url)
2957 if mobj.group('course') and mobj.group('video'): # A specific video
2958 course = mobj.group('course')
2959 video = mobj.group('video')
2961 'id': course + '_' + video,
2963 'upload_date': None,
2966 self.report_extraction(info['id'])
2967 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2968 xmlUrl = baseUrl + video + '.xml'
2970 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2971 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2972 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2974 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2976 info['title'] = mdoc.findall('./title')[0].text
2977 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2979 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2981 info['ext'] = info['url'].rpartition('.')[2]
2983 elif mobj.group('course'): # A course page
2984 course = mobj.group('course')
2989 'upload_date': None,
2992 coursepage = self._download_webpage(url, info['id'],
2993 note='Downloading course info page',
2994 errnote='Unable to download course info page')
2996 m = re.search('<h1>([^<]+)</h1>', coursepage)
2998 info['title'] = unescapeHTML(m.group(1))
3000 info['title'] = info['id']
3002 m = re.search('<description>([^<]+)</description>', coursepage)
3004 info['description'] = unescapeHTML(m.group(1))
3006 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3009 'type': 'reference',
3010 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3014 for entry in info['list']:
3015 assert entry['type'] == 'reference'
3016 results += self.extract(entry['url'])
3020 'id': 'Stanford OpenClassroom',
3023 'upload_date': None,
3026 self.report_download_webpage(info['id'])
3027 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3029 rootpage = compat_urllib_request.urlopen(rootURL).read()
3030 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3031 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3034 info['title'] = info['id']
3036 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3039 'type': 'reference',
3040 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3045 for entry in info['list']:
3046 assert entry['type'] == 'reference'
3047 results += self.extract(entry['url'])
3050 class MTVIE(InfoExtractor):
3051 """Information extractor for MTV.com"""
# NOTE(review): elided listing — guards and 'return' lines between the
# numbered rows are not visible; confirm against the full source.
3053 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3056 def report_extraction(self, video_id):
3057 """Report information extraction."""
3058 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3060 def _real_extract(self, url):
3061 mobj = re.match(self._VALID_URL, url)
3063 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3065 if not mobj.group('proto'):
3066 url = 'http://' + url
3067 video_id = mobj.group('videoid')
3069 webpage = self._download_webpage(url, video_id)
# Song name, performer, playlist URI and content id are all scraped
# from <meta> tags / inline JS on the video page.
3071 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3073 self._downloader.trouble(u'ERROR: unable to extract song name')
3075 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3076 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3078 self._downloader.trouble(u'ERROR: unable to extract performer')
3080 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3081 video_title = performer + ' - ' + song_name
3083 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3085 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3087 mtvn_uri = mobj.group(1)
3089 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3091 self._downloader.trouble(u'ERROR: unable to extract content id')
3093 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing available renditions.
3095 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3096 self.report_extraction(video_id)
3097 request = compat_urllib_request.Request(videogen_url)
3099 metadataXml = compat_urllib_request.urlopen(request).read()
3100 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3101 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3104 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3105 renditions = mdoc.findall('.//rendition')
3107 # For now, always pick the highest quality.
3108 rendition = renditions[-1]
# Format string is built as "<ext>-<width>x<height>_<bitrate>".
3111 _,_,ext = rendition.attrib['type'].partition('/')
3112 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3113 video_url = rendition.find('./src').text
3115 self._downloader.trouble('Invalid rendition field.')
3121 'uploader': performer,
3122 'upload_date': None,
3123 'title': video_title,
3131 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Videos are served in numbered
# segments; the real file id for each segment is derived from a seeded
# character shuffle (_get_file_ID_mix_string / _get_file_id).
# NOTE(review): elided listing — some original lines (guards, returns,
# format-selection branches) are not visible here.
3132 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3134 def report_download_webpage(self, file_id):
3135 """Report webpage download."""
3136 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3138 def report_extraction(self, file_id):
3139 """Report information extraction."""
3140 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp concatenated with two random ints.
3143 nowTime = int(time.time() * 1000)
3144 random1 = random.randint(1000,1998)
3145 random2 = random.randint(1000,9999)
3147 return "%d%d%d" %(nowTime,random1,random2)
3149 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet driven by `seed`
# (linear congruential step: seed = (seed*211 + 30031) % 65536).
3151 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3153 for i in range(len(source)):
3154 seed = (seed * 211 + 30031 ) % 65536
3155 index = math.floor(seed / 65536 * len(source) )
3156 mixed.append(source[int(index)])
3157 source.remove(source[int(index)])
3158 #return ''.join(mixed)
3161 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index string into real characters using the
# seeded mix string above.
3162 mixed = self._get_file_ID_mix_string(seed)
3163 ids = fileId.split('*')
3167 realId.append(mixed[int(ch)])
3168 return ''.join(realId)
3170 def _real_extract(self, url):
3171 mobj = re.match(self._VALID_URL, url)
3173 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3175 video_id = mobj.group('ID')
3177 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3179 request = compat_urllib_request.Request(info_url, None, std_headers)
3181 self.report_download_webpage(video_id)
3182 jsondata = compat_urllib_request.urlopen(request).read()
3183 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3184 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3187 self.report_extraction(video_id)
3189 jsonstr = jsondata.decode('utf-8')
3190 config = json.loads(jsonstr)
3192 video_title = config['data'][0]['title']
3193 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when available; 'worst' picks the
# low end (branch bodies elided in this listing).
3195 format = self._downloader.params.get('format', None)
3196 supported_format = list(config['data'][0]['streamfileids'].keys())
3198 if format is None or format == 'best':
3199 if 'hd2' in supported_format:
3204 elif format == 'worst':
3212 fileid = config['data'][0]['streamfileids'][format]
3213 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3214 except (UnicodeDecodeError, ValueError, KeyError):
3215 self._downloader.trouble(u'ERROR: unable to extract info section')
3219 sid = self._gen_sid()
3220 fileid = self._get_file_id(fileid, seed)
3222 #column 8,9 of fileid represent the segment number
3223 #fileid[7:9] should be changed
3224 for index, key in enumerate(keys):
# Splice the hex segment index into positions 8-9 of the file id.
3226 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3227 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3230 'id': '%s_part%02d' % (video_id, index),
3231 'url': download_url,
3233 'upload_date': None,
3234 'title': video_title,
3237 files_info.append(info)
3242 class XNXXIE(InfoExtractor):
3243 """Information extractor for xnxx.com"""
# NOTE(review): elided listing — 'if result is None:'-style guards and
# 'return' lines between the numbered rows are not visible here.
3245 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flash video URL, page title, thumbnail URL.
3247 VIDEO_URL_RE = r'flv_url=(.*?)&'
3248 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3249 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3251 def report_webpage(self, video_id):
3252 """Report information extraction"""
3253 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3255 def report_extraction(self, video_id):
3256 """Report information extraction"""
3257 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3259 def _real_extract(self, url):
3260 mobj = re.match(self._VALID_URL, url)
3262 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3264 video_id = mobj.group(1)
3266 self.report_webpage(video_id)
3268 # Get webpage content
3270 webpage_bytes = compat_urllib_request.urlopen(url).read()
3271 webpage = webpage_bytes.decode('utf-8')
3272 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3273 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# Video URL is percent-encoded inside the page; unquote it.
3276 result = re.search(self.VIDEO_URL_RE, webpage)
3278 self._downloader.trouble(u'ERROR: unable to extract video url')
3280 video_url = compat_urllib_parse.unquote(result.group(1))
3282 result = re.search(self.VIDEO_TITLE_RE, webpage)
3284 self._downloader.trouble(u'ERROR: unable to extract video title')
3286 video_title = result.group(1)
3288 result = re.search(self.VIDEO_THUMB_RE, webpage)
3290 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3292 video_thumbnail = result.group(1)
3298 'upload_date': None,
3299 'title': video_title,
3301 'thumbnail': video_thumbnail,
3302 'description': None,
3306 class GooglePlusIE(InfoExtractor):
3307 """Information extractor for plus.google.com."""
# NOTE(review): elided listing — guards/returns between numbered rows are
# not visible; confirm against the full source.
3309 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3310 IE_NAME = u'plus.google'
3312 def __init__(self, downloader=None):
3313 InfoExtractor.__init__(self, downloader)
3315 def report_extract_entry(self, url):
3316 """Report downloading extry"""
3317 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3319 def report_date(self, upload_date):
3320 """Report downloading extry"""
3321 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3323 def report_uploader(self, uploader):
3324 """Report downloading extry"""
3325 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3327 def report_title(self, video_title):
3328 """Report downloading extry"""
3329 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3331 def report_extract_vid_page(self, video_page):
3332 """Report information extraction."""
3333 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3335 def _real_extract(self, url):
3336 # Extract id from URL
3337 mobj = re.match(self._VALID_URL, url)
3339 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3342 post_url = mobj.group(0)
3343 video_id = mobj.group(1)
3345 video_extension = 'flv'
3347 # Step 1, Retrieve post webpage to extract further information
3348 self.report_extract_entry(post_url)
3349 request = compat_urllib_request.Request(post_url)
3351 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3352 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3353 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3356 # Extract update date
3358 pattern = 'title="Timestamp">(.*?)</a>'
3359 mobj = re.search(pattern, webpage)
3361 upload_date = mobj.group(1)
3362 # Convert timestring to a format suitable for filename
# Reformats the scraped "YYYY-MM-DD" date to the YYYYMMDD form used by
# upload_date fields.
3363 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3364 upload_date = upload_date.strftime('%Y%m%d')
3365 self.report_date(upload_date)
3369 pattern = r'rel\="author".*?>(.*?)</a>'
3370 mobj = re.search(pattern, webpage)
3372 uploader = mobj.group(1)
3373 self.report_uploader(uploader)
3376 # Get the first line for title
3378 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3379 mobj = re.search(pattern, webpage)
3381 video_title = mobj.group(1)
3382 self.report_title(video_title)
3384 # Step 2, Stimulate clicking the image box to launch video
3385 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3386 mobj = re.search(pattern, webpage)
3388 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3390 video_page = mobj.group(1)
3391 request = compat_urllib_request.Request(video_page)
3393 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3394 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3395 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3397 self.report_extract_vid_page(video_page)
3400 # Extract video links on video page
3401 """Extract video links of all sizes"""
3402 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3403 mobj = re.findall(pattern, webpage)
3405 self._downloader.trouble(u'ERROR: unable to extract video links')
3407 # Sort in resolution
# findall yields (resolution, url) tuples; sorting puts the largest
# resolution last, and [-1] of that tuple is the URL itself.
3408 links = sorted(mobj)
3410 # Choose the lowest of the sort, i.e. highest resolution
3411 video_url = links[-1]
3412 # Only get the url. The resolution part in the tuple has no use anymore
3413 video_url = video_url[-1]
3414 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 the AttributeError path
# round-trips through bytes to apply unicode-escape.
3416 video_url = video_url.decode("unicode_escape")
3417 except AttributeError: # Python 3
3418 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3424 'uploader': uploader,
3425 'upload_date': upload_date,
3426 'title': video_title,
3427 'ext': video_extension,
3430 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages. The media URL is built
# directly from the URL path; title/date/description are scraped from
# the page with the local _findProp helper.
# NOTE(review): elided listing — guards/returns and part of the info dict
# are not visible between the numbered rows.
3431 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3434 def _real_extract(self, url):
3435 mobj = re.match(self._VALID_URL, url)
3437 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3440 video_id = mobj.group(1)
3441 if video_id.endswith('/index.html'):
3442 video_id = video_id[:-len('/index.html')]
3444 webpage = self._download_webpage(url, video_id)
# Direct CDN URL derived from the path component of the page URL.
3446 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3447 def _findProp(rexp, default=None):
3448 m = re.search(rexp, webpage)
3450 return unescapeHTML(m.group(1))
3454 shortened_video_id = video_id.rpartition('/')[2]
3455 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3457 'id': shortened_video_id,
3461 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3462 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3466 class JustinTVIE(InfoExtractor):
3467 """Information extractor for justin.tv and twitch.tv"""
3468 # TODO: One broadcast may be split into multiple videos. The key
3469 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3470 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): elided listing — try/return/loop-control lines between
# the numbered rows are not visible here.
3472 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3473 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3474 _JUSTIN_PAGE_LIMIT = 100
3475 IE_NAME = u'justin.tv'
3477 def report_extraction(self, file_id):
3478 """Report information extraction."""
3479 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3481 def report_download_page(self, channel, offset):
3482 """Report attempt to download a single page of videos."""
3483 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3484 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3486 # Return count of items, list of *valid* items
3487 def _parse_page(self, url):
3489 urlh = compat_urllib_request.urlopen(url)
3490 webpage_bytes = urlh.read()
3491 webpage = webpage_bytes.decode('utf-8', 'ignore')
3492 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3493 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list of clips; a non-list response is an error
# object whose 'error' field is reported.
3496 response = json.loads(webpage)
3497 if type(response) != list:
3498 error_text = response.get('error', 'unknown error')
3499 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3502 for clip in response:
3503 video_url = clip['video_file_url']
3505 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' → 'YYYYMMDD' by stripping dashes.
3506 video_date = re.sub('-', '', clip['start_time'][:10])
3507 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3508 video_id = clip['id']
3509 video_title = clip.get('title', video_id)
3513 'title': video_title,
3514 'uploader': clip.get('channel_name', video_uploader_id),
3515 'uploader_id': video_uploader_id,
3516 'upload_date': video_date,
3517 'ext': video_extension,
3519 return (len(response), info)
3521 def _real_extract(self, url):
3522 mobj = re.match(self._VALID_URL, url)
3524 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 = channel (paged archives API); group 2 = single broadcast id.
3527 api = 'http://api.justin.tv'
3528 video_id = mobj.group(mobj.lastindex)
3530 if mobj.lastindex == 1:
3532 api += '/channel/archives/%s.json'
3534 api += '/broadcast/by_archive/%s.json'
3535 api = api % (video_id,)
3537 self.report_extraction(video_id)
3541 limit = self._JUSTIN_PAGE_LIMIT
3544 self.report_download_page(video_id, offset)
3545 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3546 page_count, page_info = self._parse_page(page_url)
3547 info.extend(page_info)
# A short page (fewer than `limit` items) means we reached the end.
3548 if not paged or page_count != limit:
3553 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com. Scrapes the <video> source
# URL, the player-page title, and the og:description meta tag.
# NOTE(review): elided listing — 'if m is None:' guards and returns
# between the numbered rows are not visible here.
3554 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3556 def _real_extract(self, url):
3557 mobj = re.match(self._VALID_URL, url)
3559 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3562 video_id = mobj.group('id')
3563 webpage = self._download_webpage(url, video_id)
3565 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3567 self._downloader.trouble(u'ERROR: unable to find video information')
3568 video_url = unescapeHTML(m.group('url'))
3570 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3572 self._downloader.trouble(u'Cannot find video title')
3573 title = unescapeHTML(m.group('title'))
3575 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3577 desc = unescapeHTML(m.group('desc'))
3586 'description': desc,
3590 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com game trailer pages.
# Matches movie entries, titles and thumbnails in lockstep with zip().
# NOTE(review): elided listing — parts of the regex, the videos list and
# the per-video info dict are not visible between the numbered rows.
3591 _VALID_URL = r"""http://store.steampowered.com/
3592 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3594 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# The verbose-mode regex requires overriding suitable() to pass re.VERBOSE.
3598 def suitable(cls, url):
3599 """Receives a URL and returns True if suitable for this IE."""
3600 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3602 def _real_extract(self, url):
3603 m = re.match(self._VALID_URL, url, re.VERBOSE)
3604 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3605 gameID = m.group('gameID')
3606 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3607 webpage = self._download_webpage(videourl, gameID)
3608 mweb = re.finditer(urlRE, webpage)
3609 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3610 titles = re.finditer(namesRE, webpage)
3611 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3612 thumbs = re.finditer(thumbsRE, webpage)
3614 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3615 video_id = vid.group('videoID')
3616 title = vtitle.group('videoName')
3617 video_url = vid.group('videoURL')
3618 video_thumb = thumb.group('thumbnail')
3620 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3625 'title': unescapeHTML(title),
3626 'thumbnail': video_thumb
3631 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos. The media URL is
# built directly from the video id; title and uploader id are scraped
# from data-* attributes on the page.
# NOTE(review): elided listing — the returned info dict is only partially
# visible between the numbered rows.
3632 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3633 IE_NAME = u'ustream'
3635 def _real_extract(self, url):
3636 m = re.match(self._VALID_URL, url)
3637 video_id = m.group('videoID')
3638 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3639 webpage = self._download_webpage(url, video_id)
3640 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3641 title = m.group('title')
3642 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3643 uploader = m.group('uploader')
3649 'uploader': uploader
3653 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows. Show metadata is a JSON
# blob embedded in an inline <script> (window.gon); the stream URL comes
# from its 'akamai_url' field with a fixed 256kbps cbr query appended.
3654 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3656 def _real_extract(self, url):
3657 m = re.match(self._VALID_URL, url)
3658 video_id = m.group('videoID')
3660 webpage = self._download_webpage(url, video_id)
3661 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3663 raise ExtractorError(u'Cannot find metadata')
3664 json_data = m.group(1)
3667 data = json.loads(json_data)
3668 except ValueError as e:
3669 raise ExtractorError(u'Invalid JSON: ' + str(e))
3671 video_url = data['akamai_url'] + '&cbr=256'
3672 url_parts = compat_urllib_parse_urlparse(video_url)
# Extension is whatever follows the last '.' in the URL path.
3673 video_ext = url_parts.path.rpartition('.')[2]
3678 'title': data['title'],
3679 'description': data.get('teaser_text'),
3680 'location': data.get('country_of_origin'),
3681 'uploader': data.get('host', {}).get('name'),
3682 'uploader_id': data.get('host', {}).get('slug'),
3683 'thumbnail': data.get('image', {}).get('large_url_2x'),
3684 'duration': data.get('duration'),
3689 class YouPornIE(InfoExtractor):
3690 """Information extractor for youporn.com."""
# NOTE(review): elided listing — loop headers, returns, and parts of the
# per-format info dict are not visible between the numbered rows.
3691 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3693 def _print_formats(self, formats):
3694 """Print all available formats"""
3695 print(u'Available formats:')
3696 print(u'ext\t\tformat')
3697 print(u'---------------------------------')
3698 for format in formats:
3699 print(u'%s\t\t%s' % (format['ext'], format['format']))
3701 def _specific(self, req_format, formats):
# Linear search for the entry whose 'format' equals req_format
# (surrounding loop/return lines elided in this listing).
3703 if(x["format"]==req_format):
3707 def _real_extract(self, url):
3708 mobj = re.match(self._VALID_URL, url)
3710 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3713 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie before fetching the page.
3715 req = compat_urllib_request.Request(url)
3716 req.add_header('Cookie', 'age_verified=1')
3717 webpage = self._download_webpage(req, video_id)
3719 # Get the video title
3720 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3722 raise ExtractorError(u'Unable to extract video title')
3723 video_title = result.group('title').strip()
3725 # Get the video date
3726 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3728 self._downloader.report_warning(u'unable to extract video date')
3731 upload_date = result.group('date').strip()
3733 # Get the video uploader
3734 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3736 self._downloader.report_warning(u'unable to extract uploader')
3737 video_uploader = None
3739 video_uploader = result.group('uploader').strip()
3740 video_uploader = clean_html( video_uploader )
3742 # Get all of the formats available
3743 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3744 result = re.search(DOWNLOAD_LIST_RE, webpage)
3746 raise ExtractorError(u'Unable to extract download list')
3747 download_list_html = result.group('download_list').strip()
3749 # Get all of the links from the page
3750 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3751 links = re.findall(LINK_RE, download_list_html)
3752 if(len(links) == 0):
3753 raise ExtractorError(u'ERROR: no known formats available for video')
3755 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3760 # A link looks like this:
3761 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3762 # A path looks like this:
3763 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Size and bitrate (e.g. '480p', '370k') come from path segment 4.
3764 video_url = unescapeHTML( link )
3765 path = compat_urllib_parse_urlparse( video_url ).path
3766 extension = os.path.splitext( path )[1][1:]
3767 format = path.split('/')[4].split('_')[:2]
3770 format = "-".join( format )
3771 title = u'%s-%s-%s' % (video_title, size, bitrate)
3776 'uploader': video_uploader,
3777 'upload_date': upload_date,
3782 'description': None,
3786 if self._downloader.params.get('listformats', None):
3787 self._print_formats(formats)
3790 req_format = self._downloader.params.get('format', None)
3791 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Format selection: best/worst/all/specific (branch bodies partially
# elided in this listing).
3793 if req_format is None or req_format == 'best':
3795 elif req_format == 'worst':
3796 return [formats[-1]]
3797 elif req_format in ('-1', 'all'):
3800 format = self._specific( req_format, formats )
3802 self._downloader.trouble(u'ERROR: requested format not available')
3808 class PornotubeIE(InfoExtractor):
3809 """Information extractor for pornotube.com."""
# NOTE(review): elided listing — guards/returns and part of the info dict
# are not visible between the numbered rows.
3810 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3812 def _real_extract(self, url):
3813 mobj = re.match(self._VALID_URL, url)
3815 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Both the id and the display title come straight from the URL.
3818 video_id = mobj.group('videoid')
3819 video_title = mobj.group('title')
3821 # Get webpage content
3822 webpage = self._download_webpage(url, video_id)
3825 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3826 result = re.search(VIDEO_URL_RE, webpage)
3828 self._downloader.trouble(u'ERROR: unable to extract video url')
3830 video_url = compat_urllib_parse.unquote(result.group('url'))
3832 #Get the uploaded date
3833 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3834 result = re.search(VIDEO_UPLOADED_RE, webpage)
3836 self._downloader.trouble(u'ERROR: unable to extract video title')
3838 upload_date = result.group('date')
3840 info = {'id': video_id,
3843 'upload_date': upload_date,
3844 'title': video_title,
3850 class YouJizzIE(InfoExtractor):
3851 """Information extractor for youjizz.com."""
# NOTE(review): elided listing — guards/returns and part of the info dict
# are not visible between the numbered rows.
3852 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3854 def _real_extract(self, url):
3855 mobj = re.match(self._VALID_URL, url)
3857 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3860 video_id = mobj.group('videoid')
3862 # Get webpage content
3863 webpage = self._download_webpage(url, video_id)
3865 # Get the video title
3866 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3868 raise ExtractorError(u'ERROR: unable to extract video title')
3869 video_title = result.group('title').strip()
3871 # Get the embed page
# The real source URL lives on a separate embed page; its numeric id
# replaces the slug-style id from the watch URL.
3872 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3874 raise ExtractorError(u'ERROR: unable to extract embed page')
3876 embed_page_url = result.group(0).strip()
3877 video_id = result.group('videoid')
3879 webpage = self._download_webpage(embed_page_url, video_id)
3882 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3884 raise ExtractorError(u'ERROR: unable to extract video url')
3885 video_url = result.group('source')
3887 info = {'id': video_id,
3889 'title': video_title,
3892 'player_url': embed_page_url}
3896 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes. Reads the PAGE.mix JSON
# embedded in the page, then walks the play/next API one track at a time
# until 'at_last_track' is set.
# NOTE(review): elided listing — mix_id assignment, the result list and
# loop-exit lines are not visible between the numbered rows.
3898 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3900 def _real_extract(self, url):
3901 mobj = re.match(self._VALID_URL, url)
3903 raise ExtractorError(u'Invalid URL: %s' % url)
3904 playlist_id = mobj.group('id')
3906 webpage = self._download_webpage(url, playlist_id)
3908 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3910 raise ExtractorError(u'Cannot find trax information')
3911 json_like = m.group(1)
3912 data = json.loads(json_like)
# Random session id for the play API; NOTE(review): uses `random`, not
# `secrets` — fine here since it is not security-sensitive.
3914 session = str(random.randint(0, 1000000000))
3916 track_count = data['tracks_count']
3917 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3918 next_url = first_url
3920 for i in itertools.count():
3921 api_json = self._download_webpage(next_url, playlist_id,
3922 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3923 errnote=u'Failed to download song information')
3924 api_data = json.loads(api_json)
3925 track_data = api_data[u'set']['track']
3927 'id': track_data['id'],
3928 'url': track_data['track_file_stream_url'],
3929 'title': track_data['performer'] + u' - ' + track_data['name'],
3930 'raw_title': track_data['name'],
3931 'uploader_id': data['user']['login'],
3935 if api_data['set']['at_last_track']:
3937 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3940 class KeekIE(InfoExtractor):
# Information extractor for keek.com. Media and thumbnail URLs are built
# directly from the video id; title and uploader are scraped from the page.
# NOTE(review): elided listing — the returned info dict is only partially
# visible between the numbered rows.
3941 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3944 def _real_extract(self, url):
3945 m = re.match(self._VALID_URL, url)
3946 video_id = m.group('videoID')
3947 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3948 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3949 webpage = self._download_webpage(url, video_id)
3950 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3951 title = unescapeHTML(m.group('title'))
3952 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3953 uploader = unescapeHTML(m.group('uploader'))
3959 'thumbnail': thumbnail,
3960 'uploader': uploader
3964 class TEDIE(InfoExtractor):
# Information extractor for ted.com talks and playlists. A talk URL is
# handled directly by _talk_info; a playlist URL enumerates its talks and
# calls _talk_info for each.
# NOTE(review): elided listing — parts of the verbose regexes and the
# returned info dicts are not visible between the numbered rows.
3965 _VALID_URL=r'''http://www.ted.com/
3967 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3969 ((?P<type_talk>talks)) # We have a simple talk
3971 /(?P<name>\w+) # Here goes the name and then ".html"
# The verbose-mode regex requires overriding suitable() to pass re.VERBOSE.
3975 def suitable(cls, url):
3976 """Receives a URL and returns True if suitable for this IE."""
3977 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3979 def _real_extract(self, url):
3980 m=re.match(self._VALID_URL, url, re.VERBOSE)
3981 if m.group('type_talk'):
3982 return [self._talk_info(url)]
3984 playlist_id=m.group('playlist_id')
3985 name=m.group('name')
3986 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
3987 return self._playlist_videos_info(url,name,playlist_id)
3989 def _talk_video_link(self,mediaSlug):
3990 '''Returns the video link for that mediaSlug'''
3991 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3993 def _playlist_videos_info(self,url,name,playlist_id=0):
3994 '''Returns the videos of the playlist'''
3996 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3997 ([.\s]*?)data-playlist_item_id="(\d+)"
3998 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4000 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4001 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4002 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4003 m_names=re.finditer(video_name_RE,webpage)
# Pair each talk entry with its title link, in document order.
4005 for m_video, m_name in zip(m_videos,m_names):
4006 video_id=m_video.group('video_id')
4007 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4008 info.append(self._talk_info(talk_url,video_id))
4011 def _talk_info(self, url, video_id=0):
4012 """Return the video for the talk in the url"""
4013 m=re.match(self._VALID_URL, url,re.VERBOSE)
4014 videoName=m.group('name')
4015 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4016 # If the url includes the language we get the title translated
4017 title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4018 title=re.search(title_RE, webpage).group('title')
4019 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4020 "id":(?P<videoID>[\d]+).*?
4021 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4022 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4023 thumb_match=re.search(thumb_RE,webpage)
4024 info_match=re.search(info_RE,webpage,re.VERBOSE)
4025 video_id=info_match.group('videoID')
4026 mediaSlug=info_match.group('mediaSlug')
4027 video_url=self._talk_video_link(mediaSlug)
4033 'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the metadata XML for the video behind *url* and return a
        one-element list with its info dictionary.

        Fixes vs. the original block: the fallback split on the parent
        path is now guarded by `if not video_id:` (previously the id was
        always clobbered), both error paths `return` after reporting
        trouble (previously execution fell through on broken metadata),
        missing metadata fields get explicit defaults instead of raising
        NameError, and the result dict is actually built and returned.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        # format/description/thumbnail are optional; fall back gracefully
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4093 def gen_extractors():
4094 """ Return a list of an instance of every supported extractor.
4095 The order does matter; the first extractor matched is the one handling the URL.
4098 YoutubePlaylistIE(),
4122 StanfordOpenClassroomIE(),