Removed the conversion from YouTube's closed-caption format to SRT, since the YouTube API supports serving subtitles as SRT directly (fmt=srt).
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18
19 from .utils import *
20
21
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces a *list* of
    dictionaries describing the video(s) behind it; the FileDownloader
    then acts on that information, typically by downloading the video.

    Every result dictionary must contain:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    These fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses must define a _VALID_URL regexp and re-implement the
    _real_initialize() and _real_extract() methods; they should usually
    also be added to the list of extractors.  Broken extractors should
    set _WORKING to False so that users are warned and tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when this extractor can handle the given URL."""
        return bool(re.match(self._VALID_URL, url))

    def working(self):
        """Return whether this extractor is known to work."""
        return self._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization; overridden by subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction; overridden by subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the extractor name from the class name, minus the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open the URL and return the response handle.

        Raises ExtractorError (with the original traceback attached) on
        network-level failures.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download the page and return its body decoded as UTF-8 text."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        return handle.read().decode('utf-8', 'replace')
128
129
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles plain watch URLs, embed/e/v URLs, youtu.be short links and
    naked video IDs (see _VALID_URL).
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces English UI so that scraped strings (dates, etc.) are predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag format code -> container extension.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Maps itag format code -> "HEIGHTxWIDTH" string (used for --list-formats).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Overrides the base-class IE_NAME property with a fixed name.
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented,
        # multi-line pattern; the base-class suitable() would not compile it.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _extract_subtitles(self, video_id):
        """Download the closed captions for video_id as SRT.

        Returns an (error, srt) pair: on success error is None and srt
        holds the subtitle file contents; on failure srt is None and
        error carries a u'WARNING: ...' message for the downloader.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a mapping of language code -> track name from the listing.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: explicit --sub-lang, then English, then whatever
        # language happens to come first in the listing.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'lang': srt_lang,
            'name': srt_lang_list[srt_lang].encode('utf-8'),
            'v': video_id,
            'fmt': 'srt',  # ask the API for SRT directly; no local conversion needed
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt:
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, srt)

    def _print_formats(self, formats):
        """Print each available format code with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the UI language and, if credentials exist, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the GALX and dsh anti-forgery tokens out of the login form;
        # they must be echoed back in the login POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the YouTube video ID from a URL matching _VALID_URL (group 2)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract video info for url.

        Returns a list of info dictionaries, one per selected format, or
        None after reporting trouble to the downloader.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page's JS config.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' contexts in turn until one of
        # them yields a response containing a 'token'.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            # NOTE(review): the bare except swallows parse failures; once one
            # expression parses, the remaining iterations raise and are ignored,
            # leaving the first successful YYYYMMDD value in place.
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is not guarded like 'itag'/'url' above; an
            # entry lacking a signature would raise KeyError here — confirm
            # whether the API can omit it.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            # Honour --max-quality by truncating the preference list.
            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
571
572
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST past the age gate."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and uploader; returns a one-element list."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate to the downloader, which will re-dispatch to YoutubeIE.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        # NOTE(review): webpage stays as raw bytes (no .decode()); the str
        # regex patterns and the .decode('utf-8') calls below assume
        # Python 2 str semantics — confirm before running under Python 3.
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: dig the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
698
699
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # The id is the first URL path component, stripped of any slug
        # suffix ("_title") and query string.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, probing from highest to lowest.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        # The URL is JSON-escaped ("\/" for "/")
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the owner span first, then the official-user span.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is not None:
            video_uploader = mobj.group(1)
        else:
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is not None:
                video_uploader = mobj_official.group(1)
            else:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
787
788
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: read() returns bytes on Python 3 and
            # the regexes below use text patterns.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Match groups are already text; no .decode() needed (and str
        # has no .decode() on Python 3).
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
852
853
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Yahoo! Video URL.

        Non-/watch/ URLs are first resolved to their canonical /watch/
        form and re-extracted (with new_video=False on the recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode immediately: read() returns bytes on Python 3
                # and the regexes below use text patterns.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        # Group 1 captures the 'people|profile' path component; the
        # uploader name is in group 2.
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
995
996
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a Vimeo page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS redirect URLs to the plain video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page script.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page;
            # ValueError: the extracted snippet is not valid JSON.
            # (Was a bare except, which also hid unrelated bugs.)
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1115
1116
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content as text, or None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode immediately: read() returns bytes on Python 3 and
            # grep_webpage() matches with text patterns.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, match regex, and return a dict of named groups.

        matchTuples is a list of (group_index, key, error_message)
        tuples; on success the returned dict maps each key to its
        matched group. Returns None (after reporting trouble) on any
        failure.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage() already reported the problem; bail out
            # instead of passing None to re.search (TypeError).
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp stream data for a live page.

        NOTE(review): the computed video_url is never returned, so live
        streams are effectively unsupported; _real_extract() yields no
        result for them. Preserved as-is pending a real implementation.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Raw strings here avoid invalid "\." escape warnings; "\'" and
        # "'" are equivalent in a regex, so the pattern is unchanged.
        info = self.grep_webpage(
            next_url,
            r"(s_artestras_scst_geoFRDE_" + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an arte+7 page; return its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # The page is decoded in fetch_webpage(), so the title is
            # already text; no .decode() needed.
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages (index-NN.html) are handled by extractLiveStream,
        # which currently produces no downloadable result.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1251
1252
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        # Re-queue the resolved URL so the full extractor chain runs on it.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode immediately: read() returns bytes on Python 3 and
            # the regexes below use text patterns.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # (Was misreported as "unable to extract title" — copy/paste slip.)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1400
1401
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # The query was encoded as UTF-8 in _real_extract, so decode it as
        # UTF-8 here; using the locale's preferred encoding would mangle or
        # reject non-ASCII queries on non-UTF-8 systems.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearchN:terms' query and download the top N results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms containing ':' work;
        # an unbounded split used to raise ValueError on such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 'ytsearch' scheme
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer matches than requested; shrink the
            # limit so we stop paging once everything has been fetched.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # Renamed loop variable: 'id' shadowed the builtin.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1476
1477
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # The query was encoded as UTF-8 in _real_extract, so decode it as
        # UTF-8 here; the locale's preferred encoding would mangle or reject
        # non-ASCII queries on non-UTF-8 systems.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearchN:terms' query and download the top N results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms containing ':' work;
        # an unbounded split used to raise ValueError on such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 'gvsearch' scheme
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)

    def _download_result_list(self, video_ids):
        """Queue every collected result URL for download."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated across pages.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_result_list(video_ids)
                        return

            # No "next page" link: we have collected everything there is.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_result_list(video_ids)
                return

            pagenum = pagenum + 1
1558
1559
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # The query was encoded as UTF-8 in _real_extract, so decode it as
        # UTF-8 here; the locale's preferred encoding would mangle or reject
        # non-ASCII queries on non-UTF-8 systems.
        query = query.decode('utf-8')
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearchN:terms' query and download the top N results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split only on the first colon so search terms containing ':' work;
        # an unbounded split used to raise ValueError on such queries.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # strip the 'yvsearch' scheme
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)

    def _download_result_list(self, video_ids):
        """Queue every collected result URL for download."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated across pages.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_result_list(video_ids)
                        return

            # No "next page" link: we have collected everything there is.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_result_list(video_ids)
                return

            pagenum = pagenum + 1
1644
1645
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: the list-type query key ('p', 'a' or 'list'); group 2: the
    # playlist id; group 3 (optional): a trailing reference that is handed
    # straight to the downloader as a single item.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    # Filled with (playlist_access, playlist_prefix, playlist_id, pagenum).
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
    # Literal "Next »" link text; its presence in a page means more follow.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            # Note: 'url' is deliberately reused for the per-page URL here.
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (de-duplicated within each page)
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based; -1 means no end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
1724
1725
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Walk every listing page of a channel and queue its videos."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        channel_id = match.group(1)
        collected_ids = []
        page_number = 1

        # Keep fetching listing pages until the "Next »" marker disappears.
        while True:
            self.report_download_page(channel_id, page_number)
            page_url = self._TEMPLATE_URL % (channel_id, page_number)
            try:
                request = compat_urllib_request.Request(page_url)
                html = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, skipping duplicates within it.
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', html):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in html:
                break
            page_number += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(collected_ids)))

        for vid in collected_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1776
1777
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect a user's uploads via the paged GData feed and queue them."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = match.group(1)

        # The GData feed is paged (currently 50 entries per request); keep
        # fetching until a page comes back short, which marks the end.
        video_ids = []
        page_index = 0

        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            feed_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            try:
                request = compat_urllib_request.Request(feed_url)
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within this page while preserving order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in page_ids:
                    page_ids.append(match.group(1))
            video_ids.extend(page_ids)

            # A short page means there is nothing further to query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start / --playlist-end window.
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1859
1860
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # The numeric users_id embedded in the profile page is required to
        # build the Ajax episode-list endpoint.  A failed match previously
        # escaped as an uncaught AttributeError; report it cleanly instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract users_id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers.  Compare the *unescaped* id when
            # de-duplicating; the raw match used to be tested against the
            # unescaped stored entries, letting entity-bearing duplicates in.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply the --playlist-start / --playlist-end window.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1951
1952
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles.com link to the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (posting gateway_result=1 emulates clicking the free-download form).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            # The site shows an "Attention..." notice when download is refused.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume byte-string
        # regex groups (Python 2 semantics); under Python 3, str has no
        # .decode and the page would need decoding before matching — confirm
        # which interpreter versions this path still supports.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2011
2012
2013 class FacebookIE(InfoExtractor):
2014     """Information Extractor for Facebook"""
2015
2016     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2017     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2018     _NETRC_MACHINE = 'facebook'
2019     IE_NAME = u'facebook'
2020
2021     def report_login(self):
2022         """Report attempt to log in."""
2023         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2024
    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available.

        Credentials come from the downloader's username/password params or,
        failing that, from the 'facebook' machine entry in .netrc.  Failures
        are reported as warnings and initialization simply returns.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials configured; proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login form, authentication
            # was rejected (bad credentials or rate limiting).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return
2068
    def _real_extract(self, url):
        """Extract the stream URL, title, duration and thumbnail from a
        Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Normalise to the canonical video.php URL before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array in inline
        # JavaScript, delimited exactly by these two markers.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is a percent-encoded JSON object holding the stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream; fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2108
2109
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information, either directly (when the server
        returns a video/* Content-Type) or via blip.tv's JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Re-request the same page with skin=json to get machine-readable data.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content to iTunes-style user agents.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                if isinstance(title, bytes):
                    # Bug fix: only Python 2 yields bytes here; calling
                    # .decode() unconditionally raised AttributeError on
                    # Python 3, where splitext returns str.
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some API responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '12-31-12 11:05PM'; normalise to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
2199
2200
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this used to call self._download.trouble, an attribute
            # that does not exist, so invalid URLs raised AttributeError
            # instead of producing the intended error report.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server base URL, from which
        # the flv location is derived.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2249
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate identifiers reported by the config feed.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension for each bitrate id (used by --list-formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions for each bitrate id (used by --list-formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        # Announce the start of extraction for this episode.
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        # Announce download of the per-media configuration XML.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        # Announce download of the episode's MRSS index feed.
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode (or a single clip) and return one
        info dict per part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to full-episode URLs.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode in the URL means "download the newest one".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The site redirects the bare show URL to the newest episode;
            # re-parse the redirected URL to identify that episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists every part of the episode as an <item>.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like '...:<show>.com:<id>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each part has a configuration XML listing its renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) pairs for every rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only provides RTMP urls; rewrite the path onto the
            # HTTP mirror below to get a downloadable url.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2443
2444
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Announce the start of extraction for this show.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Announce download of the player configuration.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL from the player configuration referenced
        by the page's og:video meta tag."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the charset declared in Content-Type, defaulting to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): each search below assumes its meta tag is present;
        # a missing tag would raise AttributeError on .group(1).
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2518
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked non-working; kept for reference until the site format is re-checked.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Read metadata from the moogaloop XML, then derive the media URL
        from the f4m manifest it points at."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally from the two XML documents below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Rebuild the first fragment URL from the manifest's id and media url.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2589
2590
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of the watch page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL (percent-encoded inside a flashvars-style parameter).
        m = re.search(r'flv_url=(.+?)&', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title, taken from the page <title> element.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Thumbnail; the entire matched URL is kept.
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2648
2649
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track id via the public API, then fetch its stream
        definitions and return the 128kbps mp3 url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        # NOTE(review): simple_title is never used below.
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to track metadata (including the numeric id).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream definitions for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2722
2723
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Decode the base64-embedded stream id and build the rtmpe URL."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream path is base64-encoded in the page source.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        encoded_id = m.group(1).encode('ascii')
        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title.
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if m is not None:
            video_description = m.group(1)
        else:
            video_description = u'No description available.'

        # The id and extension come from the final path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2777
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps a format name to either {bitrate: [urls]} or a
        plain [urls] list; 'best' (or an unknown bitrate) selects the
        highest available bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print each available format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUG FIX: regex groups are already text strings here; the old
        # .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2892
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Dispatches on URL shape: a specific video page yields one info dict;
    a course page and the site root yield playlists that are expanded by
    recursively calling self.extract on each referenced page.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sibling XML file with its title and file name.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                # findall returned no elements: the XML lacks the expected tags.
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every video-page link and extract each one in turn.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every course-page link and extract each one recursively.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3004
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUG FIX: _download_webpage returns decoded text, so the meta
        # values are used directly; the old .decode('iso-8859-1') calls
        # raised AttributeError on Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen service returns an XML list of renditions for the clip.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # e.g. type="video/mp4" -> ext "mp4"; format string encodes
            # extension, resolution and bitrate for --get-format users.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3084
3085
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as multiple segments whose file ids are
    obfuscated; they are decoded with a seed-driven character shuffle.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from a ms timestamp and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character alphabet shuffled by a seed-driven generator.

        Each step advances the seed with a linear-congruential formula,
        then picks (and removes) one character from the source alphabet.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # getPlayList returns JSON with title, seed, stream file ids and seg keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user's format request onto Youku's stream names:
            # 'best' prefers hd2 when available; 'worst' takes mp4; else flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters at indices 8-9 of fileid encode the segment number;
        # splice in the segment index as two uppercase hex digits.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3195
3196
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page; network failures abort extraction.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv url is url-quoted inside the player parameters.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3259
3260
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # BUG FIX: previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # BUG FIX: previously continued and raised IndexError on links[-1].
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3384
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The mp4 lives on Turner's CDN at a predictable path built from the id.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first unescaped regex group from the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date', so the upload date
            # was silently ignored by the downloader.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3420
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    # Number of clips requested per API page when walking a channel archive.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Fetch one API page and convert its clips into info dicts.

        Returns (total_clip_count, info_list); clips lacking a
        'video_file_url' are counted but omitted from the list.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # One regex group matched -> bare channel URL -> paged archive listing;
        # two groups -> /b/<id> URL -> a single broadcast lookup.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3507
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The downloadable file is the second <source> inside the <video> tag.
        match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not match:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(match.group('url'))

        match = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not match:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(match.group('title'))

        # Description is optional; fall back to None when the og: tag is absent.
        match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(match.group('desc')) if match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3544
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The numeric status id names the actual .mov file on the CDN.
        match = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not match:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = match.group(1)

        match = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not match:
            self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded anchors; the remaining tweet text doubles as the title.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', match.group(1))).strip()

        match = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not match:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(match.group('uploader'))
        uploader_id = unescapeHTML(match.group('uploader_id'))

        match = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not match:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(match.group(1))).strftime('%Y%m%d')

        return [{
            'id': video_id,
            'url': 'http://files.tweetreel.com/video/' + status_id + '.mov',
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }]
3593         
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com trailers."""
    _VALID_URL = r"""http://store.steampowered.com/ 
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = mobj.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % game_id
        webpage = self._download_webpage(videourl, game_id)
        # Movie metadata sits in inline javascript; the titles appear in
        # <span class="title"> tags in the same document order, so the two
        # iterators can be paired positionally.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        videos = []
        for movie_match, title_match in zip(re.finditer(urlRE, webpage), re.finditer(namesRE, webpage)):
            video_id = movie_match.group('videoID')
            video_url = movie_match.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_match.group('videoName')),
            })
        return videos
3629
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV is addressable directly on the CDN by recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
3651
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded in the page as a JSON blob assigned
        # to gon.show inside an inline <script>.
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the bitrate query to request the 256 kbit stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3686
3687
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' field equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract the requested format(s) for a youporn.com video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal when missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component starts with "<size>_<bitrate>_...".
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # The first scraped link is treated as 'best', the last as 'worst'.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this previously tested the unrelated 'result' variable
            # (always non-None here), so an unavailable requested format
            # silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
3804
3805         
3806
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both id and title are part of the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL from the flash player configuration
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the message previously said "unable to extract video
            # title" although this branch fails on the upload date.
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3848
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title.
        match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = match.group('title').strip()

        # The actual stream URL only appears on the embed page.
        match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = match.group(0).strip()
        video_id = match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL through addVariable.
        match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3894
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment to PAGE.mix.
        match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(match.group(1))

        # The play API hands out one track per request for a random session
        # id, until it flags the last track of the mix.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3938
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the media file and its thumbnail are addressable on the CDN
        # directly by video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': unescapeHTML(uploader_match.group('uploader'))
        }]
3962
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            # A single talk page.
            return [self._talk_info(url)]
        # Otherwise a whole playlist was requested.
        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk metadata and talk titles appear in the same document order,
        # so the two iterators can be paired positionally.
        matched_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        matched_names = re.finditer(video_name_RE, webpage)
        return [{
            'id': m_video.group('video_id'),
            'url': self._talk_video_link(m_video.group('mediaSlug')),
            'ext': 'mp4',
            'title': m_name.group('fullname')
        } for m_video, m_name in zip(matched_videos, matched_names)]

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title = re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        return {
            'id': video_id,
            'url': self._talk_video_link(mediaSlug),
            'ext': 'mp4',
            'title': title
        }
4034
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video via the site's XML metadata endpoint."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read the undefined name 'ext',
            # raising a NameError whenever <format_id> was missing; fall back
            # to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and preview image are optional elements.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4090
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE: ordering is significant — the first extractor whose pattern
    # matches handles the URL, so do not reorder this tuple.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        GenericIE,
    )
    return [ie_class() for ie_class in extractor_classes]
4142
4143