_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18
  19 from .utils import *
  20
  21
  22 class InfoExtractor(object):
  23     """Information Extractor class.
  24
  25     Information extractors are the classes that, given a URL, extract
  26     information about the video (or videos) the URL refers to. This
  27     information includes the real video URL, the video title, author and
  28     others. The information is stored in a dictionary which is then
  29     passed to the FileDownloader. The FileDownloader processes this
  30     information possibly downloading the video to the file system, among
  31     other possible outcomes.
  32
  33     The dictionaries must include the following fields:
  34
  35     id:             Video identifier.
  36     url:            Final video URL.
  37     title:          Video title, unescaped.
  38     ext:            Video filename extension.
  39
  40     The following fields are optional:
  41
  42     format:         The video format, defaults to ext (used for --get-format)
  43     thumbnail:      Full URL to a video thumbnail image.
  44     description:    One-line video description.
  45     uploader:       Full name of the video uploader.
  46     upload_date:    Video upload date (YYYYMMDD).
  47     uploader_id:    Nickname or id of the video uploader.
  48     location:       Physical location of the video.
  49     player_url:     SWF Player URL (used for rtmpdump).
  50     subtitles:      The .srt file contents.
  51     urlhandle:      [internal] The urlHandle to be used to download the file,
  52                     like returned by urllib.request.urlopen
  53
  54     The fields should all be Unicode strings.
  55
  56     Subclasses of this one should re-define the _real_initialize() and
  57     _real_extract() methods and define a _VALID_URL regexp.
  58     Probably, they should also be added to the list of extractors.
  59
  60     _real_extract() must return a *list* of information dictionaries as
  61     described above.
  62
  63     Finally, the _WORKING attribute should be set to False for broken IEs
  64     in order to warn the users and skip the tests.
  65     """
  66
  67     _ready = False
  68     _downloader = None
  69     _WORKING = True
  70
  71     def __init__(self, downloader=None):
  72         """Constructor. Receives an optional downloader."""
  73         self._ready = False
  74         self.set_downloader(downloader)
  75
  76     def suitable(self, url):
  77         """Receives a URL and returns True if suitable for this IE."""
  78         return re.match(self._VALID_URL, url) is not None
  79
  80     def working(self):
  81         """Getter method for _WORKING."""
  82         return self._WORKING
  83
  84     def initialize(self):
  85         """Initializes an instance (authentication, etc)."""
  86         if not self._ready:
  87             self._real_initialize()
  88             self._ready = True
  89
  90     def extract(self, url):
  91         """Extracts URL information and returns it in list of dicts."""
  92         self.initialize()
  93         return self._real_extract(url)
  94
  95     def set_downloader(self, downloader):
  96         """Sets the downloader for this IE."""
  97         self._downloader = downloader
  98
  99     def _real_initialize(self):
 100         """Real initialization process. Redefine in subclasses."""
 101         pass
 102
 103     def _real_extract(self, url):
 104         """Real extraction process. Redefine in subclasses."""
 105         pass
 106
 107     @property
 108     def IE_NAME(self):
 109         return type(self).__name__[:-2]
 110
 111     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 112         """ Returns the response handle """
 113         if note is None:
 114             note = u'Downloading video webpage'
 115         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 116         try:
 117             return compat_urllib_request.urlopen(url_or_request)
 118         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 119             if errnote is None:
 120                 errnote = u'Unable to download webpage'
 121             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 122
 123     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 124         """ Returns the data of the page as a string """
 125         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 126         webpage_bytes = urlh.read()
 127         return webpage_bytes.decode('utf-8', 'replace')
 128
 129
 130 class YoutubeIE(InfoExtractor):
 131     """Information extractor for youtube.com."""
 132
 133     _VALID_URL = r"""^
 134                      (
 135                          (?:https?://)?                                       # http(s):// (optional)
 136                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 137                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 138                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 139                          (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 140                          (?:                                                  # the various things that can precede the ID:
 141                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 142                              |(?:                                             # or the v= param in all its forms
 143                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 144                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 145                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 146                                  v=
 147                              )
 148                          )?                                                   # optional -> youtube.com/xxxx is OK
 149                      )?                                                       # all until now is optional -> you can pass the naked ID
 150                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 151                      (?(1).+)?                                                # if we found the ID, everything can follow
 152                      $"""
 153     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 154     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 155     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 156     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 157     _NETRC_MACHINE = 'youtube'
 158     # Listed in order of quality
 159     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 160     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 161     _video_extensions = {
 162         '13': '3gp',
 163         '17': 'mp4',
 164         '18': 'mp4',
 165         '22': 'mp4',
 166         '37': 'mp4',
 167         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 168         '43': 'webm',
 169         '44': 'webm',
 170         '45': 'webm',
 171         '46': 'webm',
 172     }
 173     _video_dimensions = {
 174         '5': '240x400',
 175         '6': '???',
 176         '13': '???',
 177         '17': '144x176',
 178         '18': '360x640',
 179         '22': '720x1280',
 180         '34': '360x640',
 181         '35': '480x854',
 182         '37': '1080x1920',
 183         '38': '3072x4096',
 184         '43': '360x640',
 185         '44': '480x854',
 186         '45': '720x1280',
 187         '46': '1080x1920',
 188     }
 189     IE_NAME = u'youtube'
 190
 191     def suitable(self, url):
 192         """Receives a URL and returns True if suitable for this IE."""
 193         return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 194
 195     def report_lang(self):
 196         """Report attempt to set language."""
 197         self._downloader.to_screen(u'[youtube] Setting language')
 198
 199     def report_login(self):
 200         """Report attempt to log in."""
 201         self._downloader.to_screen(u'[youtube] Logging in')
 202
 203     def report_age_confirmation(self):
 204         """Report attempt to confirm age."""
 205         self._downloader.to_screen(u'[youtube] Confirming age')
 206
 207     def report_video_webpage_download(self, video_id):
 208         """Report attempt to download video webpage."""
 209         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 210
 211     def report_video_info_webpage_download(self, video_id):
 212         """Report attempt to download video info webpage."""
 213         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 214
 215     def report_video_subtitles_download(self, video_id):
 216         """Report attempt to download video info webpage."""
 217         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 218
 219     def report_information_extraction(self, video_id):
 220         """Report attempt to extract video information."""
 221         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 222
 223     def report_unavailable_format(self, video_id, format):
 224         """Report extracted video URL."""
 225         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 226
 227     def report_rtmp_download(self):
 228         """Indicate the download will use the RTMP protocol."""
 229         self._downloader.to_screen(u'[youtube] RTMP download detected')
 230
 231     def _closed_captions_xml_to_srt(self, xml_string):
 232         srt = ''
 233         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 234         # TODO parse xml instead of regex
 235         for n, (start, dur_tag, dur, caption) in enumerate(texts):
 236             if not dur: dur = '4'
 237             start = float(start)
 238             end = start + float(dur)
 239             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 240             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 241             caption = unescapeHTML(caption)
 242             caption = unescapeHTML(caption) # double cycle, intentional
 243             srt += str(n+1) + '\n'
 244             srt += start + ' --> ' + end + '\n'
 245             srt += caption + '\n\n'
 246         return srt
 247
 248     def _extract_subtitles(self, video_id):
 249         self.report_video_subtitles_download(video_id)
 250         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 251         try:
 252             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 253         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 254             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 255         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 256         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 257         if not srt_lang_list:
 258             return (u'WARNING: video has no closed captions', None)
 259         if self._downloader.params.get('subtitleslang', False):
 260             srt_lang = self._downloader.params.get('subtitleslang')
 261         elif 'en' in srt_lang_list:
 262             srt_lang = 'en'
 263         else:
 264             srt_lang = list(srt_lang_list.keys())[0]
 265         if not srt_lang in srt_lang_list:
 266             return (u'WARNING: no closed captions found in the specified language', None)
 267         params = compat_urllib_parse.urlencode({
 268             'lang': srt_lang,
 269             'name': srt_lang_list[srt_lang].encode('utf-8'),
 270             'v': video_id,
 271         })
 272         url = 'http://www.youtube.com/api/timedtext?' + params
 273         try:
 274             srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
 275         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 276             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 277         if not srt_xml:
 278             return (u'WARNING: Did not fetch video subtitles', None)
 279         return (None, self._closed_captions_xml_to_srt(srt_xml))
 280
 281     def _print_formats(self, formats):
 282         print('Available formats:')
 283         for x in formats:
 284             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 285
 286     def _real_initialize(self):
 287         if self._downloader is None:
 288             return
 289
 290         username = None
 291         password = None
 292         downloader_params = self._downloader.params
 293
 294         # Attempt to use provided username and password or .netrc data
 295         if downloader_params.get('username', None) is not None:
 296             username = downloader_params['username']
 297             password = downloader_params['password']
 298         elif downloader_params.get('usenetrc', False):
 299             try:
 300                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 301                 if info is not None:
 302                     username = info[0]
 303                     password = info[2]
 304                 else:
 305                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 306             except (IOError, netrc.NetrcParseError) as err:
 307                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
 308                 return
 309
 310         # Set language
 311         request = compat_urllib_request.Request(self._LANG_URL)
 312         try:
 313             self.report_lang()
 314             compat_urllib_request.urlopen(request).read()
 315         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 316             self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
 317             return
 318
 319         # No authentication to be performed
 320         if username is None:
 321             return
 322
 323         request = compat_urllib_request.Request(self._LOGIN_URL)
 324         try:
 325             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 326         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 327             self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
 328             return
 329
 330         galx = None
 331         dsh = None
 332         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 333         if match:
 334           galx = match.group(1)
 335
 336         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 337         if match:
 338           dsh = match.group(1)
 339
 340         # Log in
 341         login_form_strs = {
 342                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 343                 u'Email': username,
 344                 u'GALX': galx,
 345                 u'Passwd': password,
 346                 u'PersistentCookie': u'yes',
 347                 u'_utf8': u'霱',
 348                 u'bgresponse': u'js_disabled',
 349                 u'checkConnection': u'',
 350                 u'checkedDomains': u'youtube',
 351                 u'dnConn': u'',
 352                 u'dsh': dsh,
 353                 u'pstMsg': u'0',
 354                 u'rmShown': u'1',
 355                 u'secTok': u'',
 356                 u'signIn': u'Sign in',
 357                 u'timeStmp': u'',
 358                 u'service': u'youtube',
 359                 u'uilel': u'3',
 360                 u'hl': u'en_US',
 361         }
 362         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 363         # chokes on unicode
 364         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 365         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 366         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 367         try:
 368             self.report_login()
 369             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 370             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 371                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 372                 return
 373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 374             self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
 375             return
 376
 377         # Confirm age
 378         age_form = {
 379                 'next_url':     '/',
 380                 'action_confirm':   'Confirm',
 381                 }
 382         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 383         try:
 384             self.report_age_confirmation()
 385             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 386         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 387             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 388             return
 389
 390     def _extract_id(self, url):
 391         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 392         if mobj is None:
 393             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 394             return
 395         video_id = mobj.group(2)
 396         return video_id
 397
 398     def _real_extract(self, url):
 399         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 400         mobj = re.search(self._NEXT_URL_RE, url)
 401         if mobj:
 402             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 403         video_id = self._extract_id(url)
 404
 405         # Get video webpage
 406         self.report_video_webpage_download(video_id)
 407         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 408         request = compat_urllib_request.Request(url)
 409         try:
 410             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 411         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 412             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 413             return
 414
 415         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 416
 417         # Attempt to extract SWF player URL
 418         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 419         if mobj is not None:
 420             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 421         else:
 422             player_url = None
 423
 424         # Get video info
 425         self.report_video_info_webpage_download(video_id)
 426         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 427             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 428                     % (video_id, el_type))
 429             request = compat_urllib_request.Request(video_info_url)
 430             try:
 431                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 432                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 433                 video_info = compat_parse_qs(video_info_webpage)
 434                 if 'token' in video_info:
 435                     break
 436             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 437                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 438                 return
 439         if 'token' not in video_info:
 440             if 'reason' in video_info:
 441                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 442             else:
 443                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 444             return
 445
 446         # Check for "rental" videos
 447         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 448             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 449             return
 450
 451         # Start extracting information
 452         self.report_information_extraction(video_id)
 453
 454         # uploader
 455         if 'author' not in video_info:
 456             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 457             return
 458         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 459
 460         # uploader_id
 461         video_uploader_id = None
 462         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 463         if mobj is not None:
 464             video_uploader_id = mobj.group(1)
 465         else:
 466             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 467
 468         # title
 469         if 'title' not in video_info:
 470             self._downloader.trouble(u'ERROR: unable to extract video title')
 471             return
 472         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 473
 474         # thumbnail image
 475         if 'thumbnail_url' not in video_info:
 476             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 477             video_thumbnail = ''
 478         else:   # don't panic if we can't find it
 479             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 480
 481         # upload date
 482         upload_date = None
 483         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 484         if mobj is not None:
 485             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 486             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 487             for expression in format_expressions:
 488                 try:
 489                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 490                 except:
 491                     pass
 492
 493         # description
 494         video_description = get_element_by_id("eow-description", video_webpage)
 495         if video_description:
 496             video_description = clean_html(video_description)
 497         else:
 498             video_description = ''
 499
 500         # closed captions
 501         video_subtitles = None
 502         if self._downloader.params.get('writesubtitles', False):
 503             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
 504             if srt_error:
 505                 self._downloader.trouble(srt_error)
 506
 507         if 'length_seconds' not in video_info:
 508             self._downloader.trouble(u'WARNING: unable to extract video duration')
 509             video_duration = ''
 510         else:
 511             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 512
 513         # token
 514         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 515
 516         # Decide which formats to download
 517         req_format = self._downloader.params.get('format', None)
 518
 519         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 520             self.report_rtmp_download()
 521             video_url_list = [(None, video_info['conn'][0])]
 522         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 523             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 524             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 525             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 526             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 527
 528             format_limit = self._downloader.params.get('format_limit', None)
 529             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 530             if format_limit is not None and format_limit in available_formats:
 531                 format_list = available_formats[available_formats.index(format_limit):]
 532             else:
 533                 format_list = available_formats
 534             existing_formats = [x for x in format_list if x in url_map]
 535             if len(existing_formats) == 0:
 536                 self._downloader.trouble(u'ERROR: no known formats available for video')
 537                 return
 538             if self._downloader.params.get('listformats', None):
 539                 self._print_formats(existing_formats)
 540                 return
 541             if req_format is None or req_format == 'best':
 542                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 543             elif req_format == 'worst':
 544                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 545             elif req_format in ('-1', 'all'):
 546                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 547             else:
 548                 # Specific formats. We pick the first in a slash-delimeted sequence.
 549                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 550                 req_formats = req_format.split('/')
 551                 video_url_list = None
 552                 for rf in req_formats:
 553                     if rf in url_map:
 554                         video_url_list = [(rf, url_map[rf])]
 555                         break
 556                 if video_url_list is None:
 557                     self._downloader.trouble(u'ERROR: requested format not available')
 558                     return
 559         else:
 560             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 561             return
 562
 563         results = []
 564         for format_param, video_real_url in video_url_list:
 565             # Extension
 566             video_extension = self._video_extensions.get(format_param, 'flv')
 567
 568             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 569                                               self._video_dimensions.get(format_param, '???'))
 570
 571             results.append({
 572                 'id':       video_id,
 573                 'url':      video_real_url,
 574                 'uploader': video_uploader,
 575                 'uploader_id': video_uploader_id,
 576                 'upload_date':  upload_date,
 577                 'title':    video_title,
 578                 'ext':      video_extension,
 579                 'format':   video_format,
 580                 'thumbnail':    video_thumbnail,
 581                 'description':  video_description,
 582                 'player_url':   player_url,
 583                 'subtitles':    video_subtitles,
 584                 'duration':     video_duration
 585             })
 586         return results
 587
 588
 589 class MetacafeIE(InfoExtractor):
 590     """Information Extractor for metacafe.com."""
 591
 592     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 593     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 594     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 595     IE_NAME = u'metacafe'
 596
 597     def __init__(self, downloader=None):
 598         InfoExtractor.__init__(self, downloader)
 599
 600     def report_disclaimer(self):
 601         """Report disclaimer retrieval."""
 602         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 603
 604     def report_age_confirmation(self):
 605         """Report attempt to confirm age."""
 606         self._downloader.to_screen(u'[metacafe] Confirming age')
 607
 608     def report_download_webpage(self, video_id):
 609         """Report webpage download."""
 610         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 611
 612     def report_extraction(self, video_id):
 613         """Report information extraction."""
 614         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 615
 616     def _real_initialize(self):
 617         # Retrieve disclaimer
 618         request = compat_urllib_request.Request(self._DISCLAIMER)
 619         try:
 620             self.report_disclaimer()
 621             disclaimer = compat_urllib_request.urlopen(request).read()
 622         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 623             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 624             return
 625
 626         # Confirm age
 627         disclaimer_form = {
 628             'filters': '0',
 629             'submit': "Continue - I'm over 18",
 630             }
 631         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 632         try:
 633             self.report_age_confirmation()
 634             disclaimer = compat_urllib_request.urlopen(request).read()
 635         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 636             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 637             return
 638
 639     def _real_extract(self, url):
 640         # Extract id and simplified title from URL
 641         mobj = re.match(self._VALID_URL, url)
 642         if mobj is None:
 643             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 644             return
 645
 646         video_id = mobj.group(1)
 647
 648         # Check if video comes from YouTube
 649         mobj2 = re.match(r'^yt-(.*)$', video_id)
 650         if mobj2 is not None:
 651             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 652             return
 653
 654         # Retrieve video webpage to extract further information
 655         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 656         try:
 657             self.report_download_webpage(video_id)
 658             webpage = compat_urllib_request.urlopen(request).read()
 659         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 660             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 661             return
 662
 663         # Extract URL, uploader and title from webpage
 664         self.report_extraction(video_id)
 665         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 666         if mobj is not None:
 667             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 668             video_extension = mediaURL[-3:]
 669
 670             # Extract gdaKey if available
 671             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 672             if mobj is None:
 673                 video_url = mediaURL
 674             else:
 675                 gdaKey = mobj.group(1)
 676                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 677         else:
 678             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 679             if mobj is None:
 680                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 681                 return
 682             vardict = compat_parse_qs(mobj.group(1))
 683             if 'mediaData' not in vardict:
 684                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 685                 return
 686             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 687             if mobj is None:
 688                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 689                 return
 690             mediaURL = mobj.group(1).replace('\\/', '/')
 691             video_extension = mediaURL[-3:]
 692             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 693
 694         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 695         if mobj is None:
 696             self._downloader.trouble(u'ERROR: unable to extract title')
 697             return
 698         video_title = mobj.group(1).decode('utf-8')
 699
 700         mobj = re.search(r'submitter=(.*?);', webpage)
 701         if mobj is None:
 702             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 703             return
 704         video_uploader = mobj.group(1)
 705
 706         return [{
 707             'id':       video_id.decode('utf-8'),
 708             'url':      video_url.decode('utf-8'),
 709             'uploader': video_uploader.decode('utf-8'),
 710             'upload_date':  None,
 711             'title':    video_title,
 712             'ext':      video_extension.decode('utf-8'),
 713         }]
 714
 715
 716 class DailymotionIE(InfoExtractor):
 717     """Information Extractor for Dailymotion"""
 718
 719     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 720     IE_NAME = u'dailymotion'
 721     _WORKING = False
 722
 723     def __init__(self, downloader=None):
 724         InfoExtractor.__init__(self, downloader)
 725
 726     def report_extraction(self, video_id):
 727         """Report information extraction."""
 728         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 729
 730     def _real_extract(self, url):
 731         # Extract id and simplified title from URL
 732         mobj = re.match(self._VALID_URL, url)
 733         if mobj is None:
 734             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 735             return
 736
 737         video_id = mobj.group(1).split('_')[0].split('?')[0]
 738
 739         video_extension = 'mp4'
 740
 741         # Retrieve video webpage to extract further information
 742         request = compat_urllib_request.Request(url)
 743         request.add_header('Cookie', 'family_filter=off')
 744         webpage = self._download_webpage(request, video_id)
 745
 746         # Extract URL, uploader and title from webpage
 747         self.report_extraction(video_id)
 748         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 749         if mobj is None:
 750             self._downloader.trouble(u'ERROR: unable to extract media URL')
 751             return
 752         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 753
 754         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 755             if key in flashvars:
 756                 max_quality = key
 757                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 758                 break
 759         else:
 760             self._downloader.trouble(u'ERROR: unable to extract video URL')
 761             return
 762
 763         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 764         if mobj is None:
 765             self._downloader.trouble(u'ERROR: unable to extract video URL')
 766             return
 767
 768         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 769
 770         # TODO: support choosing qualities
 771
 772         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 773         if mobj is None:
 774             self._downloader.trouble(u'ERROR: unable to extract title')
 775             return
 776         video_title = unescapeHTML(mobj.group('title'))
 777
 778         video_uploader = None
 779         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 780         if mobj is None:
 781             # lookin for official user
 782             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 783             if mobj_official is None:
 784                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 785             else:
 786                 video_uploader = mobj_official.group(1)
 787         else:
 788             video_uploader = mobj.group(1)
 789
 790         video_upload_date = None
 791         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 792         if mobj is not None:
 793             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 794
 795         return [{
 796             'id':       video_id,
 797             'url':      video_url,
 798             'uploader': video_uploader,
 799             'upload_date':  video_upload_date,
 800             'title':    video_title,
 801             'ext':      video_extension,
 802         }]
 803
 804
 805 class PhotobucketIE(InfoExtractor):
 806     """Information extractor for photobucket.com."""
 807
 808     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 809     IE_NAME = u'photobucket'
 810
 811     def __init__(self, downloader=None):
 812         InfoExtractor.__init__(self, downloader)
 813
 814     def report_download_webpage(self, video_id):
 815         """Report webpage download."""
 816         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 817
 818     def report_extraction(self, video_id):
 819         """Report information extraction."""
 820         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 821
 822     def _real_extract(self, url):
 823         # Extract id from URL
 824         mobj = re.match(self._VALID_URL, url)
 825         if mobj is None:
 826             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 827             return
 828
 829         video_id = mobj.group(1)
 830
 831         video_extension = 'flv'
 832
 833         # Retrieve video webpage to extract further information
 834         request = compat_urllib_request.Request(url)
 835         try:
 836             self.report_download_webpage(video_id)
 837             webpage = compat_urllib_request.urlopen(request).read()
 838         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 839             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 840             return
 841
 842         # Extract URL, uploader, and title from webpage
 843         self.report_extraction(video_id)
 844         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 845         if mobj is None:
 846             self._downloader.trouble(u'ERROR: unable to extract media URL')
 847             return
 848         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 849
 850         video_url = mediaURL
 851
 852         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 853         if mobj is None:
 854             self._downloader.trouble(u'ERROR: unable to extract title')
 855             return
 856         video_title = mobj.group(1).decode('utf-8')
 857
 858         video_uploader = mobj.group(2).decode('utf-8')
 859
 860         return [{
 861             'id':       video_id.decode('utf-8'),
 862             'url':      video_url.decode('utf-8'),
 863             'uploader': video_uploader,
 864             'upload_date':  None,
 865             'title':    video_title,
 866             'ext':      video_extension.decode('utf-8'),
 867         }]
 868
 869
 870 class YahooIE(InfoExtractor):
 871     """Information extractor for video.yahoo.com."""
 872
 873     _WORKING = False
 874     # _VALID_URL matches all Yahoo! Video URLs
 875     # _VPAGE_URL matches only the extractable '/watch/' URLs
 876     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 877     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 878     IE_NAME = u'video.yahoo'
 879
 880     def __init__(self, downloader=None):
 881         InfoExtractor.__init__(self, downloader)
 882
 883     def report_download_webpage(self, video_id):
 884         """Report webpage download."""
 885         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 886
 887     def report_extraction(self, video_id):
 888         """Report information extraction."""
 889         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 890
 891     def _real_extract(self, url, new_video=True):
 892         # Extract ID from URL
 893         mobj = re.match(self._VALID_URL, url)
 894         if mobj is None:
 895             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 896             return
 897
 898         video_id = mobj.group(2)
 899         video_extension = 'flv'
 900
 901         # Rewrite valid but non-extractable URLs as
 902         # extractable English language /watch/ URLs
 903         if re.match(self._VPAGE_URL, url) is None:
 904             request = compat_urllib_request.Request(url)
 905             try:
 906                 webpage = compat_urllib_request.urlopen(request).read()
 907             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 908                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 909                 return
 910
 911             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 912             if mobj is None:
 913                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 914                 return
 915             yahoo_id = mobj.group(1)
 916
 917             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 918             if mobj is None:
 919                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 920                 return
 921             yahoo_vid = mobj.group(1)
 922
 923             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 924             return self._real_extract(url, new_video=False)
 925
 926         # Retrieve video webpage to extract further information
 927         request = compat_urllib_request.Request(url)
 928         try:
 929             self.report_download_webpage(video_id)
 930             webpage = compat_urllib_request.urlopen(request).read()
 931         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 932             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 933             return
 934
 935         # Extract uploader and title from webpage
 936         self.report_extraction(video_id)
 937         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 938         if mobj is None:
 939             self._downloader.trouble(u'ERROR: unable to extract video title')
 940             return
 941         video_title = mobj.group(1).decode('utf-8')
 942
 943         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 944         if mobj is None:
 945             self._downloader.trouble(u'ERROR: unable to extract video uploader')
 946             return
 947         video_uploader = mobj.group(1).decode('utf-8')
 948
 949         # Extract video thumbnail
 950         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 951         if mobj is None:
 952             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 953             return
 954         video_thumbnail = mobj.group(1).decode('utf-8')
 955
 956         # Extract video description
 957         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 958         if mobj is None:
 959             self._downloader.trouble(u'ERROR: unable to extract video description')
 960             return
 961         video_description = mobj.group(1).decode('utf-8')
 962         if not video_description:
 963             video_description = 'No description available.'
 964
 965         # Extract video height and width
 966         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 967         if mobj is None:
 968             self._downloader.trouble(u'ERROR: unable to extract video height')
 969             return
 970         yv_video_height = mobj.group(1)
 971
 972         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 973         if mobj is None:
 974             self._downloader.trouble(u'ERROR: unable to extract video width')
 975             return
 976         yv_video_width = mobj.group(1)
 977
 978         # Retrieve video playlist to extract media URL
 979         # I'm not completely sure what all these options are, but we
 980         # seem to need most of them, otherwise the server sends a 401.
 981         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 982         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 983         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 984                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 985                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 986         try:
 987             self.report_download_webpage(video_id)
 988             webpage = compat_urllib_request.urlopen(request).read()
 989         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 990             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 991             return
 992
 993         # Extract media URL from playlist XML
 994         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 995         if mobj is None:
 996             self._downloader.trouble(u'ERROR: Unable to extract media URL')
 997             return
 998         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
 999         video_url = unescapeHTML(video_url)
1000
1001         return [{
1002             'id':       video_id.decode('utf-8'),
1003             'url':      video_url,
1004             'uploader': video_uploader,
1005             'upload_date':  None,
1006             'title':    video_title,
1007             'ext':      video_extension.decode('utf-8'),
1008             'thumbnail':    video_thumbnail.decode('utf-8'),
1009             'description':  video_description,
1010         }]
1011
1012
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        """Set up the extractor, optionally bound to *downloader*."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Build the info dictionary for a Vimeo URL.

        The page's embedded config JSON supplies title, uploader, thumbnail,
        the request signature/timestamp and the available codecs; the
        description and upload date are read from the page markup.
        *new_video* is accepted but not used by this method.

        Returns a one-element list with the info dictionary, or None after
        an error has been reported via self._downloader.trouble().
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # play_redirect_hls links are swapped for the regular video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page.
            # ValueError: the text between the markers is not valid JSON.
            # Anything else (e.g. KeyboardInterrupt) must not be swallowed.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        video_config = config["video"]

        # Extract title
        video_title = video_config["title"]

        # Extract uploader and uploader_id
        video_uploader = video_config["owner"]["name"]
        video_uploader_id = video_config["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = video_config["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        video_description = clean_html(video_description) if video_description else ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = video_config["files"]
        for codec_name, codec_extension in codecs:
            if codec_name not in available:
                continue
            qualities = available[codec_name]
            if 'hd' in qualities:
                files['hd'].append((codec_name, codec_extension, 'hd'))
            elif 'sd' in qualities:
                files['sd'].append((codec_name, codec_extension, 'sd'))
            else:
                files['other'].append((codec_name, codec_extension, qualities[0]))

        for quality in ('hd', 'sd', 'other'):
            if files[quality]:
                video_codec, video_extension, video_quality = files[quality][0]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1131
1132
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Set up the extractor, optionally bound to *downloader*."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body as text.

        Returns None after reporting the problem through
        self._downloader.trouble() when the download fails or the URL is
        rejected as invalid.
        """
        try:
            # Request() itself may raise ValueError for a malformed URL, so
            # it has to sit inside the try block as well.
            request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return None
        except ValueError:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return None
        # Decode once here so that the str patterns used by grep_webpage()
        # also work on Python 3, where the response body is bytes.
        return webpage_bytes.decode('utf-8', 'replace')

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download *url* and pick named values out of one regex match.

        *matchTuples* is a sequence of (group, key, error_message) triples.
        The result maps every key to the text of its group.  None is
        returned, after the problem has been reported, when the page cannot
        be fetched, the regex does not match, or a listed group is empty.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage() has already reported the failure
            return None

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return None

        info = {}
        for (i, key, err) in matchTuples:
            value = mobj.group(i)
            if value is None:
                self._downloader.trouble(err)
                return None
            info[key] = value

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream pages for *url*.

        NOTE: the stream URL is computed but neither returned nor stored, so
        this method always returns None.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + ".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                "(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        # TODO: this value is currently unused (see the note in the docstring)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Return the info dictionary for a non-live video page.

        Three documents are fetched in turn: the video page, the document
        named by its 'videorefFileUrl' parameter, and the per-language
        document referenced from there, which carries id, title, date and
        the 'hd' quality URL.  Returns None if any step fails; the failure
        has then already been reported by grep_webpage().
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return None

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # already text: fetch_webpage() decodes the page
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract information for an arte.tv URL.

        URLs whose last path component matches _LIVE_URL go through
        extractLiveStream() and yield no result.  All other URLs return a
        one-element list with the info dictionary, or None when extraction
        failed.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            # Never hand a [None] result to the caller
            return

        return [info]
1267
1268
1269 class GenericIE(InfoExtractor):
1270     """Generic last-resort information extractor."""
1271
1272     _VALID_URL = r'.*'
1273     IE_NAME = u'generic'
1274
1275     def __init__(self, downloader=None):
1276         InfoExtractor.__init__(self, downloader)
1277
1278     def report_download_webpage(self, video_id):
1279         """Report webpage download."""
1280         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1282
1283     def report_extraction(self, video_id):
1284         """Report information extraction."""
1285         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1286
1287     def report_following_redirect(self, new_url):
1288         """Report information extraction."""
1289         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1290
1291     def _test_redirect(self, url):
1292         """Check if it is a redirect, like url shorteners, in case restart chain."""
1293         class HeadRequest(compat_urllib_request.Request):
1294             def get_method(self):
1295                 return "HEAD"
1296
1297         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1298             """
1299             Subclass the HTTPRedirectHandler to make it use our
1300             HeadRequest also on the redirected URL
1301             """
1302             def redirect_request(self, req, fp, code, msg, headers, newurl):
1303                 if code in (301, 302, 303, 307):
1304                     newurl = newurl.replace(' ', '%20')
1305                     newheaders = dict((k,v) for k,v in req.headers.items()
1306                                       if k.lower() not in ("content-length", "content-type"))
1307                     return HeadRequest(newurl,
1308                                        headers=newheaders,
1309                                        origin_req_host=req.get_origin_req_host(),
1310                                        unverifiable=True)
1311                 else:
1312                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1313
1314         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1315             """
1316             Fallback to GET if HEAD is not allowed (405 HTTP error)
1317             """
1318             def http_error_405(self, req, fp, code, msg, headers):
1319                 fp.read()
1320                 fp.close()
1321
1322                 newheaders = dict((k,v) for k,v in req.headers.items()
1323                                   if k.lower() not in ("content-length", "content-type"))
1324                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1325                                                  headers=newheaders,
1326                                                  origin_req_host=req.get_origin_req_host(),
1327                                                  unverifiable=True))
1328
1329         # Build our opener
1330         opener = compat_urllib_request.OpenerDirector()
1331         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1332                         HTTPMethodFallback, HEADRedirectHandler,
1333                         compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1334             opener.add_handler(handler())
1335
1336         response = opener.open(HeadRequest(url))
1337         new_url = response.geturl()
1338
1339         if url == new_url:
1340             return False
1341
1342         self.report_following_redirect(new_url)
1343         self._downloader.download([new_url])
1344         return True
1345
1346     def _real_extract(self, url):
1347         if self._test_redirect(url): return
1348
1349         video_id = url.split('/')[-1]
1350         request = compat_urllib_request.Request(url)
1351         try:
1352             self.report_download_webpage(video_id)
1353             webpage = compat_urllib_request.urlopen(request).read()
1354         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1356             return
1357         except ValueError as err:
1358             # since this is the last-resort InfoExtractor, if
1359             # this error is thrown, it'll be thrown here
1360             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361             return
1362
1363         self.report_extraction(video_id)
1364         # Start with something easy: JW Player in SWFObject
1365         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1366         if mobj is None:
1367             # Broaden the search a little bit
1368             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1369         if mobj is None:
1370             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371             return
1372
1373         # It's possible that one of the regexes
1374         # matched, but returned an empty group:
1375         if mobj.group(1) is None:
1376             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1377             return
1378
1379         video_url = compat_urllib_parse.unquote(mobj.group(1))
1380         video_id = os.path.basename(video_url)
1381
1382         # here's a fun little line of code for you:
1383         video_extension = os.path.splitext(video_id)[1][1:]
1384         video_id = os.path.splitext(video_id)[0]
1385
1386         # it's tempting to parse this further, but you would
1387         # have to take into account all the variations like
1388         #   Video Title - Site Name
1389         #   Site Name | Video Title
1390         #   Video Title - Tagline | Site Name
1391         # and so on and so forth; it's just not practical
1392         mobj = re.search(r'<title>(.*)</title>', webpage)
1393         if mobj is None:
1394             self._downloader.trouble(u'ERROR: unable to extract title')
1395             return
1396         video_title = mobj.group(1)
1397
1398         # video uploader is domain name
1399         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1400         if mobj is None:
1401             self._downloader.trouble(u'ERROR: unable to extract title')
1402             return
1403         video_uploader = mobj.group(1)
1404
1405         return [{
1406             'id':       video_id,
1407             'url':      video_url,
1408             'uploader': video_uploader,
1409             'upload_date':  None,
1410             'title':    video_title,
1411             'ext':      video_extension,
1412         }]
1413
1414
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepted queries look like ``ytsearch:terms`` (first result),
    ``ytsearchN:terms`` (first N results) or ``ytsearchall:terms``
    (up to _max_youtube_results results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain ':' and _VALID_URL accepts such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return

        # Only the integer conversion is guarded, so that a ValueError raised
        # while downloading is not mistaken for a malformed prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Results are requested 50 per page; 'limit' shrinks once the API
        # reports that fewer than n results exist in total.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            # Decode explicitly: not every Python 3 release lets json.loads
            # parse a byte string.
            api_response = json.loads(data.decode('utf-8'))['data']

            # NOTE(review): 'items' is presumably absent when a page carries
            # no results - tolerate that instead of raising KeyError.
            video_ids.extend(video['id'] for video in api_response.get('items', []))

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past n entries.
        for video_id in video_ids[:n]:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1489
1490
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepted queries look like ``gvsearch:terms`` (first result),
    ``gvsearchN:terms`` (first N results) or ``gvsearchall:terms``
    (up to _max_google_results results).
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain ':' and _VALID_URL accepts such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer conversion is guarded, so that a ValueError raised
        # while downloading is not mistaken for a malformed prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order
            reached_n = False
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        reached_n = True
                        break

            # Stop when enough ids were collected or no further result page
            # is advertised.
            if reached_n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
1571
1572
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepted queries look like ``yvsearch:terms`` (first result),
    ``yvsearchN:terms`` (first N results) or ``yvsearchall:terms``
    (up to _max_yahoo_results results).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain ':' and _VALID_URL accepts such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer conversion is guarded, so that a ValueError raised
        # while downloading is not mistaken for a malformed prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order
            reached_n = False
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        reached_n = True
                        break

            # Stop when enough ids were collected or no further result page
            # is advertised.
            if reached_n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

            pagenum = pagenum + 1

        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
1657
1658
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Announce that the given page of the playlist is being fetched."""
        message = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Collect the video ids of a playlist and hand each to the downloader.

        The collected list is narrowed by the downloader's 'playliststart'
        and 'playlistend' parameters before the downloads are issued.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # The URL names one video inside the playlist: download just that.
        single_video = match.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # 'p' is the default playlist prefix; artist pages ('a') are
        # reached through a different path.
        if match.group(1) == 'a':
            playlist_prefix, playlist_access = 'a', 'artist'
        else:
            playlist_prefix, playlist_access = 'p', 'view_play_list'
        playlist_id = match.group(2)

        collected = []
        pagenum = 0
        while True:
            pagenum += 1
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids in order of appearance, without repeats.
            page_ids = []
            for found in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                candidate = found.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)
            collected += page_ids

            if self._MORE_PAGES_INDICATOR not in page:
                break

        total = len(collected)

        # 'playliststart' is 1-based; -1 for 'playlistend' means "to the end".
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        selected = collected[first:] if last == -1 else collected[first:last]

        if len(selected) == total:
            summary = u'[youtube] PL %s: Found %i videos' % (playlist_id, total)
        else:
            summary = u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(selected))
        self._downloader.to_screen(summary)

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1737
1738
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Announce that the given page of the channel is being fetched."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Walk the channel's video pages and download every video found."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected = []
        pagenum = 0

        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Gather this page's ids in order of appearance, without repeats.
            page_ids = []
            for found in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = found.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)
            collected += page_ids

            # Keep paging only while a "Next" link is present.
            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected)))

        for video_id in collected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1789
1790
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Accepts user page URLs as well as the ``ytuser:NAME`` shorthand and
    downloads the user's uploads.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        # A page covers _GDATA_PAGE_SIZE entries starting at start_index, so
        # its last index is start_index + size - 1 (e.g. 1 to 50, not 1 to 51).
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE - 1))

    def _real_extract(self, url):
        """Collect the ids of a user's uploads and hand each to the downloader.

        The collected list is narrowed by the downloader's 'playliststart'
        and 'playlistend' parameters before the downloads are issued.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # 'playliststart' is 1-based; -1 for 'playlistend' means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1872
1873
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Accepts user page URLs as well as the ``bliptvuser:NAME`` shorthand and
    downloads every episode listed for that user.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect the episode paths of a user and hand each to the downloader.

        The collected list is narrowed by the downloader's 'playliststart'
        and 'playlistend' parameters before the downloads are issued.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The episode list is addressed by the numeric users id embedded in
        # the user page; report a missing id instead of failing on None.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped form, since that is what gets stored;
                # otherwise ids containing HTML entities are never recognised
                # as duplicates.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # 'playliststart' is 1-based; -1 for 'playlistend' means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1964
1965
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com file page to its direct download URL.

        Returns a list holding a single info dictionary, or None when the
        page could not be fetched or parsed (failures are reported through
        the downloader's trouble()).
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None:
                # Collapse the whitespace of the notice into single spaces
                # (raw string: '\s' is not a valid string escape).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # Extension of the download URL, without the leading dot
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2024
2025
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials come from the downloader parameters or
    from .netrc) and then extracts the video URL, title, duration and
    thumbnail from the video page.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials are taken from the 'username'/'password' downloader
        parameters or, if 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. Every failure is reported as a warning on stderr;
        the method then simply returns without being logged in.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data must be bytes on Python 3; urlencode() output is pure
        # ASCII, and on Python 2 it already is a byte string.
        login_data = compat_urllib_parse.urlencode(login_form)
        if not isinstance(login_data, bytes):
            login_data = login_data.encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            # Decode so that the text pattern below can be applied to the
            # response on both Python 2 and Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Still seeing the login form means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video information for a Facebook video URL.

        Returns a one-element list holding the info dictionary, or None if
        the URL is invalid. Raises ExtractorError when the page cannot be
        parsed, or when no video URL or title can be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Always fetch the canonical video page, whatever URL form was given.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON array located between these two
        # JavaScript snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself URL-quoted JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source, but fall back to SD when there is none.
        video_url = params.get('hd_src') or params.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2117
2118
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL turned out to be a direct video download."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract the video information for a blip.tv URL.

        The URL is requested with JSON skin parameters. If the response is
        itself a video (Content-Type starting with 'video/'), the info is
        built from the URL and the open handle is passed on as 'urlhandle';
        otherwise the JSON response is parsed.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported. Raises ExtractorError if the
        initial request fails.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Only byte strings need decoding; text strings (Python 3
                # str, Python 2 unicode) are used as they are.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2208
2209
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the video information for a myvideo.de watch URL.

        The media URL is derived from the thumbnail link found in the page:
        the part of the thumbnail URL before '/thumbs/' plus '/<id>.flv'.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Report through the downloader like every other error path here.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2258
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepted URLs (the pattern is matched with re.VERBOSE):
    #  - abbreviations like :thedailyshow or :colbert
    #  - full episodes:  http://<show>.com/full-episodes/<episode>
    #  - clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #           or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #           or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Format identifiers are the rendition bitrates.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the media configuration for one part."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each format with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip.

        Returns a list with one info dictionary per <item> of the episode
        index feed, or None after an error has been reported or after the
        available formats have been listed ('listformats' parameter).
        Raises ExtractorError if an RTMP URL cannot be transformed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand an abbreviation into the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No episode in the URL: use whatever the page redirects to.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            # The group is '' for a bare full-episodes URL and None when the
            # redirect led to a clip URL; neither yields an episode title.
            if not mobj.group('episode'):
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid is colon-separated: the last field is the media id,
            # the one before it the show's domain.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp url) pairs for every rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (assumes the renditions are listed in ascending bitrate order)
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Map the RTMP URL onto the HTTP mirror by reusing its path.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2452
2453
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video information for an Escapist video page.

        The page's og:video meta tag carries the player URL, whose 'config'
        parameter points to the configuration that contains the video URL.
        Description and thumbnail are optional and set to None when the
        page does not provide them.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset from the header, defaulting to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers.get('Content-Type', ''))
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Optional metadata: a missing tag must not abort the extraction.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # Required: without the player URL there is no way to the video.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers.get('Content-Type', ''))
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The video URL is taken from the second playlist entry.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.trouble(u'ERROR: unable to extract video URL from configuration')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2527
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): presumably flags this extractor as currently broken;
    # confirm against the code that reads _WORKING.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video information for a collegehumor.com video URL.

        Downloads the metadata XML for the video, then the f4m manifest it
        refers to, and builds the URL of the first segment/fragment from
        the manifest's media node and id.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # From here on video_id is the manifest's id, not the URL's.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: a node is missing; KeyError: media has no 'url'.
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the fragment URL on the manifest's host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2598
2599
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Tell the user that extraction for video_id has started."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video information for an xvideos.com video URL.

        The video URL, title and thumbnail are all scraped from the video
        page; failing to find any of them aborts the extraction.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL: URL-quoted value of the flv_url parameter.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: the <title> text in front of the " - XVID..." suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the complete matched URL, not just the file name group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2657
2658
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track page URL is first resolved to its metadata through
       api.soundcloud.com/resolve.json; the stream definitions for the
       resulting track id are then requested from api.sndcdn.com and the
       128 kbit/s HTTP MP3 stream is used as the media URL.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'
    # Client id sent with both API requests.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the track information for a soundcloud.com track URL.

        Returns a one-element list holding the info dictionary, or None
        after an error has been reported through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title
        slug_title =  mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical URL so that anything after the slug is dropped.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        # Report a missing stream instead of failing with a KeyError.
        mediaURL = streams.get('http_mp3_128_url')
        if not mediaURL:
            self._downloader.trouble(u'ERROR: unable to find an MP3 stream URL')
            return

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2731
2732
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the stream URL and metadata from an InfoQ page.

        Returns a one-element list holding the info dictionary, or None
        after reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page carries the stream path as a
        # base64-encoded, URL-quoted string in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        try:
            decoded = base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')
        except (TypeError, ValueError):
            # Malformed base64 raises TypeError on Python 2 and
            # binascii.Error (a ValueError) on Python 3; encoding and
            # decoding failures are ValueError subclasses as well.
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(decoded)
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (non-greedy, so that it stops at the first
        # closing quote of the tag instead of the last one on the line)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*?)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the LAST dot only, so that file names containing
        # several dots do not break the id/extension unpacking.
        video_filename = video_url.rpartition('/')[2]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot or not video_id:
            self._downloader.trouble(u'ERROR: unable to extract video extension')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2786
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding it from UTF-8 only if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _bitrate_rank(bitrate):
        """Sort key for bitrates: numeric value, or -1 when it is not a number."""
        try:
            return int(bitrate)
        except (TypeError, ValueError):
            return -1

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a list of URLs,
        or directly a list of URLs when there is no bitrate information.
        An unknown or 'best' bitrate selects the highest one available.
        """
        entry = jsonData[fmt]
        if not isinstance(entry, dict):
            # we have no bitrate info.
            return entry
        if not entry:
            return []
        if bitrate is None or bitrate == 'best' or bitrate not in entry:
            # select highest; compared numerically so that e.g. '64' does
            # not win over '128' (presumably the keys are numeric strings)
            bitrate = max(entry, key=self._bitrate_rank)
        return entry[bitrate]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                handle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            # The connection was only a liveness probe; do not leak it.
            handle.close()
            return url

        return None

    def _print_formats(self, formats):
        """Print every available format with its bitrates and file extensions."""
        print('Available formats:')
        for fmt in formats:
            entry = formats[fmt]
            if isinstance(entry, dict):
                for b in entry:
                    ext = entry[b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
            elif entry:
                # we have no bitrate info
                ext = entry[0]
                print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        """Extract a cloudcast through the Mixcloud JSON API.

        Returns a one-element list holding the info dictionary, or None
        after reporting the problem (or after listing the formats when
        the 'listformats' parameter is set).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        slug = self._to_text(mobj.group(2))
        file_id = uploader + "-" + slug

        # construct API request from the matched groups, so that it does
        # not depend on the URL ending with a slash
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + slug + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            json_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        try:
            json_data = json.loads(self._to_text(json_bytes))
            player_url = json_data['player_swf_url']
            formats = dict(json_data['audio_formats'])
        except (ValueError, KeyError, TypeError) as err:
            self._downloader.trouble(u'ERROR: Unable to parse file information: %s' % compat_str(err))
            return

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # take the first format that has a working URL
            for candidate in formats:
                working_url = self.check_urls(self.get_urls(formats, candidate))
                if working_url is not None:
                    file_url = working_url
                    format_param = candidate
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working download URL')
            return

        file_url = self._to_text(file_url)
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
2901
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _extract_linked_pages(self, page, link_re):
        """Run the extraction on every distinct link of page matching link_re.

        link_re must capture a path relative to the MainFolder directory.
        Returns the concatenated results of all the linked pages.
        """
        results = []
        for link in orderedSet(re.findall(link_re, page)):
            # NOTE(review): `or []` assumes self.extract() hands back the
            # _real_extract() result, which is None for a failed video.
            results += self.extract(self._MAIN_FOLDER + unescapeHTML(link)) or []
        return results

    def _real_extract(self, url):
        """Extract a single video, all videos of a course, or all courses.

        Which of the three applies depends on the 'course' and 'video'
        query parameters found in url.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = self._MAIN_FOLDER + 'courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except (IndexError, TypeError):
                # IndexError: element missing; TypeError: element is empty,
                # so its text is None and cannot be appended to baseUrl
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            coursepage = self._download_webpage(url, mobj.group('course'),
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')
            return self._extract_linked_pages(coursepage, r'<a href="(VideoPage\.php\?[^"]+)">')
        else: # Root page
            # Downloaded with the same helper as the course page, so that
            # the page is text and can be searched with a text pattern.
            rootpage = self._download_webpage(self._MAIN_FOLDER + 'HomePage.php',
                                        'Stanford OpenClassroom',
                                        note='Downloading course list page',
                                        errnote='Unable to download course info page')
            return self._extract_linked_pages(rootpage, r'<a href="(CoursePage\.php\?[^"]+)">')
3013
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a music video from its MTV.com page.

        Reads the song, performer, URI and playlist id from the page, then
        asks the mediaGen service for the available renditions.

        Returns a one-element list holding the info dictionary, or None
        after reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The page is searched with text patterns, so the matched groups
        # are text already and must not be decoded a second time.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            self._downloader.trouble(u'ERROR: unable to find any video rendition')
            return

        # For now, always pick the highest quality.
        # NOTE(review): this takes the last rendition listed; confirm that
        # the service lists them in ascending quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: attribute missing; AttributeError: no <src> child
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
3093
3094
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id from the time in milliseconds and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the file id alphabet shuffled according to seed.

        The result is a list of single characters. The shuffle is driven
        by a linear congruential generator, so that the same seed always
        yields the same order.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        while source:
            seed = (seed * 211 + 30031) % 65536
            # seed / 65536 is in [0, 1), so the index is always valid
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode fileId, a '*'-separated list of indexes into the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract all the segments of a Youku video.

        Returns a list with one info dictionary per segment, or None
        after reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)
            video_data = config['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']

            req_format = self._downloader.params.get('format', None)
            supported_format = list(video_data['streamfileids'].keys())

            if req_format is None or req_format == 'best':
                if 'hd2' in supported_format:
                    stream_format = 'hd2'
                else:
                    stream_format = 'flv'
                ext = u'flv'
            elif req_format == 'worst':
                stream_format = 'mp4'
                ext = u'mp4'
            else:
                # any other requested format falls back to flv
                stream_format = 'flv'
                ext = u'flv'

            fileid = video_data['streamfileids'][stream_format]
            keys = [s['k'] for s in video_data['segs'][stream_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            # IndexError / TypeError: 'data' is empty or not shaped as expected
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        try:
            fileid = self._get_file_id(fileid, seed)
        except (ValueError, IndexError, TypeError):
            self._downloader.trouble(u'ERROR: unable to decode file id')
            return

        # characters 9 and 10 of fileid (fileid[8:10]) hold the segment
        # number and are replaced with the index of each segment
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3204
3205
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX\.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report webpage download"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the video URL, title and thumbnail from a video page.

        Returns a one-element list holding the info dictionary, or None
        after reporting the problem through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return
        # Undecodable bytes are dropped rather than aborting the extraction
        webpage = webpage_bytes.decode('utf-8', 'ignore')

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3268
3269
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video attached to a Google+ post.

        The post page gives the date, uploader and title plus a link to
        the video page, which in turn lists the video links.

        Returns a one-element list holding the info dictionary, or None
        after reporting the problem through the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        if not post_url.startswith('https://'):
            # The scheme is optional in _VALID_URL, but required to download
            post_url = 'https://' + post_url
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            # Convert timestring to a format suitable for filename
            try:
                upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                # The date is optional: an unexpected timestamp layout
                # must not abort the whole extraction
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Choose the highest resolution. The captured number is compared
        # as an integer: compared as text, '720' would rank above '1080'.
        # Only the URL is kept, the resolution has no use anymore.
        video_url = max(links, key=lambda link: int(link[0]))[1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3393
class NBAIE(InfoExtractor):
    """Information extractor for video pages of nba.com"""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the info dictionary of a video from its page URL.

        The video URL is derived from the path of the page; the title,
        date and description are searched for in the page itself.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the path below /video, without a trailing /index.html
        index_suffix = '/index.html'
        video_id = match.group(1)
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def _findProp(rexp, default=None):
            """Return the unescaped first group of rexp in the page, or default."""
            found = re.search(rexp, webpage)
            if not found:
                return default
            return unescapeHTML(found.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        # The short id stands in for the title when the page has none
        page_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)

        # NOTE(review): 'uploader_date' is not one of the fields listed in
        # the InfoExtractor documentation ('upload_date' is) - confirm
        # whether this key is intended.
        return [{
            'id': shortened_video_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': page_title.replace('NBA.com: ', ''),
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }]
3429
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv

    A channel URL is resolved through the 'channel/archives' API call and
    fetched page by page; a ".../b/<id>" URL is resolved through the
    'broadcast/by_archive' API call in a single request.
    """
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Group 1 is the channel name; the optional group 2 is the id that
    # follows "/b/".
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    # Number of clips requested per API call.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one page of API results and turn it into info dicts.

        Returns a tuple (count, info): count is the number of clips in the
        response, info is the list of dictionaries for the clips that have
        a video URL. When the download fails or the API answers with
        something other than a list, the problem is reported and (0, [])
        is returned, so callers can always unpack the result.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            try:
                webpage_bytes = urlh.read()
            finally:
                # Do not leak the connection, whether or not read() worked
                urlh.close()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # Anything but a list is treated as an error description
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])

        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                # Clips without a file URL are counted but not returned
                continue
            video_id = clip['id']
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            info.append({
                'id': video_id,
                'url': video_url,
                'title': clip.get('title', video_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                # First ten characters of start_time with the dashes removed
                'upload_date': clip['start_time'][:10].replace('-', ''),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Return the list of info dictionaries for a channel or broadcast URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # lastindex is 1 when only the channel group matched and 2 when
        # the "/b/<id>" part is present
        video_id = mobj.group(mobj.lastindex)
        paged = mobj.lastindex == 1
        if paged:
            api_path = '/channel/archives/%s.json'
        else:
            api_path = '/broadcast/by_archive/%s.json'
        api = ('http://api.justin.tv' + api_path) % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page that is not full (or a failed one) is the last page
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3516
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Returns None after reporting the problem when the URL is invalid
        or the video URL or title cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The URL is taken from the second <source> of the <video> element
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # trouble() may return instead of raising, as the invalid-URL
            # branch above already assumes, so stop before touching m
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3553
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Returns None after reporting the problem when the URL is invalid
        or the status id, uploader or upload date cannot be found. A
        missing tweet text is only reported; the video id is then used as
        the title and the description is None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # trouble() may return instead of raising, as the invalid-URL
        # branch above already assumes, so every failed search below has
        # to stop (or fall back) before the match object is used.
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
            # The title is mandatory, so fall back to the video id
            title = video_id
        else:
            # The tweet text without its links doubles as the title
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
            title = desc

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        # NOTE(review): fromtimestamp() converts to local time, so the
        # date depends on the timezone of the machine -- confirm intent
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
3602
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a game on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Tell whether the given URL can be handled by this extractor."""
        # _VALID_URL is written in verbose syntax and needs re.VERBOSE
        matched = re.match(self._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return one info dictionary per movie found on the game's video page."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        game_id = url_match.group('gameID')

        # Whatever kind of URL was given, the game's video page is fetched
        page = self._download_webpage(
            'http://store.steampowered.com/video/%s/' % game_id, game_id)

        movies = re.finditer(movie_re, page)
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        names = re.finditer(name_re, page)

        entries = []
        # Movies and titles are paired up by their position in the page
        for movie, name in zip(movies, names):
            movie_id = movie.group('videoID')
            raw_title = name.group('videoName')
            movie_url = movie.group('videoURL')
            if not movie_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % movie_id)
            entries.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(raw_title),
            })
        return entries
3638
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when the URL does not match or the title
        cannot be found in the page. The uploader is optional and is None
        when it cannot be found.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        # The media URL is built from the id alone
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # [^"]+ stops at the closing quote of the attribute; a greedy .+
        # would run on to the last quote of the line
        m = re.search(r'data-title="(?P<title>[^"]+)"', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find video title')
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            uploader = None
        else:
            uploader = m.group('uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
3660
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the show.

        Raises ExtractorError when the JSON metadata embedded in the page
        cannot be found or parsed.
        """
        show_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, show_id)

        # The metadata is a JSON object assigned to gon.show in a script tag
        found = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', page)
        if found is None:
            raise ExtractorError(u'Cannot find metadata')
        raw_json = found.group(1)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        media_url = show['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path
        media_path = compat_urllib_parse_urlparse(media_url).path
        media_ext = media_path.rpartition('.')[2]
        title = show['title']
        description = show.get('teaser_text')
        location = show.get('country_of_origin')
        host = show.get('host', {})
        uploader = host.get('name')
        uploader_id = host.get('slug')
        thumbnail = show.get('image', {}).get('large_url_2x')

        return [{
            'id': show_id,
            'url': media_url,
            'ext': media_ext,
            'title': title,
            'description': description,
            'location': location,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail,
            'duration': show.get('duration'),
        }]
3695
3696
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s'  % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for fmt in formats:
            if fmt["format"] == req_format:
                return fmt
        return None

    def _real_extract(self, url):
        """Return the info dictionaries selected by the 'format' parameter.

        One dictionary is built per link of the page's download list. The
        result is the first entry for no format or 'best', the last one for
        'worst', all of them for '-1' or 'all', and otherwise the entry
        whose format name equals the requested one. Returns None after
        listing the formats ('listformats' parameter), on an invalid URL,
        or when the requested format is not available. Raises
        ExtractorError when the title, the download list or its links
        cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The page is requested with the age verification cookie set
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path component 4 is e.g. "480p_370k_8004515": size and bitrate
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format_name = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # NOTE(review): 'best' and 'worst' rely on the order of the links in
        # the page -- presumably best quality first; confirm
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            # Test the looked-up entry itself, not the leftover regex match
            # in "result", which is never None at this point
            if selected is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [selected]
3813
3814
3815
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The id and the title are taken from the URL itself. Returns None
        after reporting the problem when the URL is invalid or the video
        URL or upload date cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # The message used to name the title although the date is missing
            self._downloader.trouble(u'ERROR: unable to extract upload date')
            return
        # NOTE(review): stored exactly as matched (digits and slashes), not
        # converted to YYYYMMDD -- confirm what downstream code expects
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3857
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The title comes from the video page; the media URL comes from the
        embed page that the video page links to. Raises ExtractorError when
        one of them cannot be found; returns None on an invalid URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # First page: the regular video page
        page = self._download_webpage(url, url_match.group('videoid'))

        title_match = re.search(r'<title>(?P<title>.*)</title>', page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # From here on the numeric id of the embed URL is the video id
        video_id = embed_match.group('videoid')

        # Second page: the embed page, which holds the media URL
        embed_page = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3903
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes; yields one entry per track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return the list of info dictionaries for the tracks of a mix.

        Raises ExtractorError when the URL does not match or the mix data
        cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)

        # The mix is described by a JSON object assigned to PAGE.mix
        mix_match = re.search(r"PAGE.mix = (.*?);\n", page, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_match.group(1))

        # A random session number identifies this play-through to the API
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']

        tracks = []
        track_number = 1
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        # The first call starts playback, every following one asks for the
        # track after the one just received
        while True:
            api_json = self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            play_set = json.loads(api_json)
            track = play_set[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if play_set['set']['at_last_track']:
                return tracks
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
            track_number += 1
3947
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when the URL does not match or the title
        cannot be found in the page. The uploader is optional and is None
        when it cannot be found.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs are built from the id alone
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        # [^"]+ stops at the closing quote of the attribute; a greedy .+
        # would run on to the last quote of the line
        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]+)"', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        if m is None:
            uploader = None
        else:
            uploader = unescapeHTML(m.group('uploader'))

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
3971
class TEDIE(InfoExtractor):
    """Information extractor for talks on ted.com."""
    _VALID_URL=r'http://www.ted.com/talks/(?P<videoName>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the talk.

        Raises ExtractorError when the URL does not match or when the title
        or the talk details cannot be found in the page.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_name = m.group('videoName')
        webpage = self._download_webpage(url, 0, 'Downloading \"%s\" page' % video_name)

        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title_match = re.search(title_RE, webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find video title')
        title = title_match.group('title')

        # Verbose pattern: the line breaks and indentation inside it are
        # ignored, literal spaces are escaped
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        if info_match is None:
            raise ExtractorError(u'Cannot find talk details')
        video_id = info_match.group('videoID')
        # The download URL is built from the media slug
        media_slug = info_match.group('mediaSlug')
        video_url = 'http://download.ted.com/talks/%s.mp4' % media_slug

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title
        }
        return [info]
3995
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        All values come from the XML metadata document of the video.
        Returns None after reporting the problem when the metadata has no
        download URL or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text

        # The format defaults to the extension. The undefined name "ext"
        # used here before raised NameError whenever format_id was missing.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        # Description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4051
def gen_extractors():
    """Build a list holding one fresh instance of every supported extractor.

    The order is significant: a URL is handled by the first extractor in
    the list that matches it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
4103
4104