Option to dump intermediate pages
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into one or more dictionaries
    describing the video(s) behind it. Those dictionaries are handed to
    the FileDownloader, which processes them — typically by downloading
    the video to the file system.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define _real_initialize() and _real_extract(),
    define a _VALID_URL regexp, and normally be added to the list of
    extractors. _real_extract() must return a *list* of information
    dictionaries as described above. Broken extractors should set the
    _WORKING attribute to False so users are warned and tests skipped.
    """

    # Defaults shared by every extractor instance.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)
        self._ready = False

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name, dropping the "IE" suffix.
        return self.__class__.__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None means the default message; note=False suppresses output.
        message = u'Downloading video webpage' if note is None else note
        if message is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, message))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            reason = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (reason, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset from the Content-Type header; fall back to UTF-8.
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
                                 urlh.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return raw_page.decode(encoding, 'replace')
146
147
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" (informational only, used by --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE claim them.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name}, or an (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages to the screen."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then whatever comes first.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of result tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the formats in *formats* with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Extract the hidden GALX/dsh form tokens from the login page.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        # NOTE(review): if GALX/dsh were not found above they stay None and
        # the .encode() below would raise AttributeError — presumably they
        # are always present on the login page; confirm before relying on it.
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # POST data must be bytes under Python 3 (same as login_data above).
        request = compat_urllib_request.Request(self._AGE_URL,
                compat_urllib_parse.urlencode(age_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL), or None after reporting an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the info dict(s) for the video at *url*."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several &el= variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This expression didn't match the date string; try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id)
            return

        # duration
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = {}
            for ud in url_data:
                stream_url = ud['url'][0]
                # Not every stream entry carries a separate 'sig' field;
                # appending it unconditionally used to raise KeyError.
                if 'sig' in ud:
                    stream_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = stream_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
645
646
647 class MetacafeIE(InfoExtractor):
648     """Information Extractor for metacafe.com."""
649
650     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
651     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
652     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
653     IE_NAME = u'metacafe'
654
655     def __init__(self, downloader=None):
656         InfoExtractor.__init__(self, downloader)
657
    def report_disclaimer(self):
        """Report retrieval of the family-filter disclaimer page to the user."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
661
    def report_age_confirmation(self):
        """Report attempt to confirm age (disable the family filter)."""
        self._downloader.to_screen(u'[metacafe] Confirming age')
665
    def report_download_webpage(self, video_id):
        """Report webpage download for the given video id."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
669
670     def report_extraction(self, video_id):
671         """Report information extraction."""
672         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
673
674     def _real_initialize(self):
675         # Retrieve disclaimer
676         request = compat_urllib_request.Request(self._DISCLAIMER)
677         try:
678             self.report_disclaimer()
679             disclaimer = compat_urllib_request.urlopen(request).read()
680         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
681             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
682             return
683
684         # Confirm age
685         disclaimer_form = {
686             'filters': '0',
687             'submit': "Continue - I'm over 18",
688             }
689         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
690         try:
691             self.report_age_confirmation()
692             disclaimer = compat_urllib_request.urlopen(request).read()
693         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
694             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
695             return
696
697     def _real_extract(self, url):
698         # Extract id and simplified title from URL
699         mobj = re.match(self._VALID_URL, url)
700         if mobj is None:
701             self._downloader.report_error(u'invalid URL: %s' % url)
702             return
703
704         video_id = mobj.group(1)
705
706         # Check if video comes from YouTube
707         mobj2 = re.match(r'^yt-(.*)$', video_id)
708         if mobj2 is not None:
709             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
710             return
711
712         # Retrieve video webpage to extract further information
713         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
714         try:
715             self.report_download_webpage(video_id)
716             webpage = compat_urllib_request.urlopen(request).read()
717         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
718             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
719             return
720
721         # Extract URL, uploader and title from webpage
722         self.report_extraction(video_id)
723         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
724         if mobj is not None:
725             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
726             video_extension = mediaURL[-3:]
727
728             # Extract gdaKey if available
729             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
730             if mobj is None:
731                 video_url = mediaURL
732             else:
733                 gdaKey = mobj.group(1)
734                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
735         else:
736             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
737             if mobj is None:
738                 self._downloader.report_error(u'unable to extract media URL')
739                 return
740             vardict = compat_parse_qs(mobj.group(1))
741             if 'mediaData' not in vardict:
742                 self._downloader.report_error(u'unable to extract media URL')
743                 return
744             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
745             if mobj is None:
746                 self._downloader.report_error(u'unable to extract media URL')
747                 return
748             mediaURL = mobj.group(1).replace('\\/', '/')
749             video_extension = mediaURL[-3:]
750             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
751
752         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
753         if mobj is None:
754             self._downloader.report_error(u'unable to extract title')
755             return
756         video_title = mobj.group(1).decode('utf-8')
757
758         mobj = re.search(r'submitter=(.*?);', webpage)
759         if mobj is None:
760             self._downloader.report_error(u'unable to extract uploader nickname')
761             return
762         video_uploader = mobj.group(1)
763
764         return [{
765             'id':       video_id.decode('utf-8'),
766             'url':      video_url.decode('utf-8'),
767             'uploader': video_uploader.decode('utf-8'),
768             'upload_date':  None,
769             'title':    video_title,
770             'ext':      video_extension.decode('utf-8'),
771         }]
772
773
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL plus metadata from a
        Dailymotion watch page; returns a one-element info list."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The captured segment may carry a title slug and/or query string;
        # keep only the bare video id.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Disable the family filter so restricted videos stay visible.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Scan the quality keys from best to worst and stop at the first hit.
        max_quality = None
        for quality_key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if quality_key in flashvars:
                max_quality = quality_key
                self._downloader.to_screen(u'[dailymotion] Using %s' % quality_key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Undo JSON slash escaping in the extracted URL.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: regular owner markup first, then the official-user markup.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; convert to the YYYYMMDD convention.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
861
862
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a photobucket page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # The id is the flv file name captured from the "current" parameter.
        video_id = m.group(1)
        video_extension = 'flv'

        # Fetch the page that embeds the player.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL is advertised via the video_src <link> tag.
        self.report_extraction(video_id)
        m = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title and uploader both live in the <title> element.
        m = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = m.group(1).decode('utf-8')
        video_uploader = m.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
926
927
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-'/watch/' URLs are first resolved to their canonical /watch/
        form and re-dispatched once with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # FIX: group 1 captures only the 'people'/'profile' path component;
        # the uploader name itself is in group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1069
1070
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page via its config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and player-redirect URLs to a canonical form.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page between
        # " = {config:" and ",assets:".
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # FIX: was a bare "except:", which also swallowed KeyboardInterrupt
        # and SystemExit; IndexError (marker missing) and ValueError
        # (invalid JSON) are the expected failures here.
        except Exception:
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (page shows an ISO dateCreated; store YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, preferring hd over sd.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1189
1190
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw page body.

        Returns None after reporting when retrieval fails.
        NOTE(review): callers (grep_webpage) pass the result straight to
        re.search, so a None return would raise TypeError there — confirm
        whether report_error already aborts the run before that point.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # compat_urllib_request raises ValueError for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and collect regex match groups into a dict.

        matchTuples is a list of (group_index, key, error_message)
        triples; the returned dict maps each key to the corresponding
        match group.  Returns None (after reporting) when the page does
        not match or a required group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): trouble() looks like a legacy reporting hook;
                # sibling extractors use report_error — confirm before unifying.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection and locate the rtmp URL.

        NOTE(review): the final video_url below is computed but never
        returned or stored, so live extraction currently yields no
        downloadable result (see _real_extract, which discards this call's
        return value).
        """
        # Language code is embedded in the URL path, e.g. .../fr/... .
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through its videoref XML chain and
        return the info dict for the hd-quality stream."""
        # Language code position differs from the live-stream URL layout.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path.

        NOTE(review): the live branch returns None (extractLiveStream's
        result is discarded), so only Plus7 pages produce results.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1325
1326
1327 class GenericIE(InfoExtractor):
1328     """Generic last-resort information extractor."""
1329
1330     _VALID_URL = r'.*'
1331     IE_NAME = u'generic'
1332
1333     def __init__(self, downloader=None):
1334         InfoExtractor.__init__(self, downloader)
1335
1336     def report_download_webpage(self, video_id):
1337         """Report webpage download."""
1338         if not self._downloader.params.get('test', False):
1339             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1340         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1341
1342     def report_extraction(self, video_id):
1343         """Report information extraction."""
1344         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1345
1346     def report_following_redirect(self, new_url):
1347         """Report information extraction."""
1348         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1349
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request for url; if the final location differs,
        hands the resolved URL back to the downloader and returns True
        so the caller can abort its own extraction.  Returns False when
        no redirect occurred.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET, so the
            # redirect chain can be probed without downloading bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers: the follow-up HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the rejected response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): OpenerDirector is believed to order handlers by
        # their handler_order attribute rather than insertion order — confirm.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect; let normal extraction proceed.
        if url == new_url:
            return False

        # Redirected: restart the whole download chain on the new URL.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1404
1405     def _real_extract(self, url):
1406         if self._test_redirect(url): return
1407
1408         video_id = url.split('/')[-1]
1409         try:
1410             webpage = self._download_webpage(url, video_id)
1411         except ValueError as err:
1412             # since this is the last-resort InfoExtractor, if
1413             # this error is thrown, it'll be thrown here
1414             self._downloader.report_error(u'Invalid URL: %s' % url)
1415             return
1416
1417         self.report_extraction(video_id)
1418         # Start with something easy: JW Player in SWFObject
1419         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1420         if mobj is None:
1421             # Broaden the search a little bit
1422             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1423         if mobj is None:
1424             # Broaden the search a little bit: JWPlayer JS loader
1425             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1426         if mobj is None:
1427             self._downloader.report_error(u'Invalid URL: %s' % url)
1428             return
1429
1430         # It's possible that one of the regexes
1431         # matched, but returned an empty group:
1432         if mobj.group(1) is None:
1433             self._downloader.report_error(u'Invalid URL: %s' % url)
1434             return
1435
1436         video_url = compat_urllib_parse.unquote(mobj.group(1))
1437         video_id = os.path.basename(video_url)
1438
1439         # here's a fun little line of code for you:
1440         video_extension = os.path.splitext(video_id)[1][1:]
1441         video_id = os.path.splitext(video_id)[0]
1442
1443         # it's tempting to parse this further, but you would
1444         # have to take into account all the variations like
1445         #   Video Title - Site Name
1446         #   Site Name | Video Title
1447         #   Video Title - Tagline | Site Name
1448         # and so on and so forth; it's just not practical
1449         mobj = re.search(r'<title>(.*)</title>', webpage)
1450         if mobj is None:
1451             self._downloader.report_error(u'unable to extract title')
1452             return
1453         video_title = mobj.group(1)
1454
1455         # video uploader is domain name
1456         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1457         if mobj is None:
1458             self._downloader.report_error(u'unable to extract title')
1459             return
1460         video_uploader = mobj.group(1)
1461
1462         return [{
1463             'id':       video_id,
1464             'url':      video_url,
1465             'uploader': video_uploader,
1466             'upload_date':  None,
1467             'title':    video_title,
1468             'ext':      video_extension,
1469         }]
1470
1471
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:query)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # bare "ytsearch:" downloads the single best match
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # CONSISTENCY FIX: use report_error like every other extractor
                # in this file instead of the deprecated trouble() helper.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never request pages beyond what the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # renamed loop variable: "id" shadowed the builtin
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1550
1551
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:query)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _dispatch_videos(self, video_ids):
        """Queue every collected docid for download (extracted from the
        previously duplicated dispatch loops in _download_n_results)."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (first-seen order, de-duplicated)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._dispatch_videos(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no further result pages: download whatever was found
                self._dispatch_videos(video_ids)
                return

            pagenum = pagenum + 1
1632
1633
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:query)."""

    # marked broken upstream; kept for reference
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _dispatch_videos(self, video_ids):
        """Queue every collected watch path for download (extracted from the
        previously duplicated dispatch loops in _download_n_results)."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (set gives O(1) de-duplication,
            # list preserves first-seen order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._dispatch_videos(video_ids)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # no further result pages: download whatever was found
                self._dispatch_videos(video_ids)
                return

            pagenum = pagenum + 1
1718
1719
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all playlist entries via the GData API and queue each video."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The playlist id is captured by one of the two pattern alternatives.
        playlist_id = mobj.group(1) or mobj.group(2)

        videos = []
        page_num = 1
        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            entries = response['feed']['entry']
            # keep (position, src) pairs so entries can be ordered afterwards
            videos.extend((entry['yt$position']['$t'], entry['content']['src'])
                          for entry in entries if 'content' in entry)

            # a short page means we've reached the end of the playlist
            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        start = self._downloader.params.get('playliststart', 1) - 1
        end = self._downloader.params.get('playlistend', -1)
        if end == -1:
            videos = videos[start:]
        else:
            videos = videos[start:end]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1810
1811
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id from a channel's paged listing and queue them."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated within the page
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # the "Next »" marker disappears on the last page
            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # renamed loop variable: "id" shadowed the builtin
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1862
1863
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        end_index = start_index + self._GDATA_PAGE_SIZE
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, end_index))

    def _real_extract(self, url):
        """Fetch a user's uploads page by page, then queue the selected slice."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # keep requesting the next window until a page comes back short.
        video_ids = []
        page_index = 0

        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            request = compat_urllib_request.Request(gdata_url)

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, preserving first-seen order.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page shorter than the page size means nothing is left to fetch.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            page_index += 1

        all_ids_count = len(video_ids)
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)

        if last == -1:
            video_ids = video_ids[first:]
        else:
            video_ids = video_ids[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1945
1946
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their episode list."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # ROBUSTNESS FIX: previously a page without a data-users-id attribute
        # raised an uncaught AttributeError (.group() on None) because the
        # surrounding try only caught network errors. Report it instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # CONSISTENCY FIX: compat_str like everywhere else (was str())
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2036
2037
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # PY3 FIX: decode the response once so the str regexes below match
            # under Python 3 (bytes would raise TypeError), and the str.decode
            # calls that used to follow are no longer needed.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # raw string: '\s' in a plain literal is an invalid escape
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2096
2097
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook, using credentials from the command-line options
        or from ~/.netrc; silently proceeds anonymously when none are given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: continue without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # NOTE(review): urlopen().read() returns bytes on Python 3 while the
            # regex below uses a str pattern -- confirm the compat layer/runtime
            # this targets handles the mismatch.
            login_results = compat_urllib_request.urlopen(request).read()
            # If the login form is still present in the response, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video URL, title, duration and thumbnail from a Facebook
        video page; returns a one-element list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded in the page's JavaScript between
        # these two literal fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON inside the outer JSON payload.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2193
2194
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information, either from a direct media response or
        from blip.tv's JSON API (skin=json)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment carries the
        # real file reference; resolve it and recurse with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request JSON metadata by appending the skin=json API parameters.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The server answered with the media itself; derive id/title/ext
                # from the URL's basename and hand the open handle downstream.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str has no .decode on Python 3, so this branch
                # presumably only works under Python 2 -- confirm.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Normalize the '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Filename extension is taken from the media URL itself.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2295
2296
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this called self._download.report_error, but no attribute
            # named _download exists -- an invalid URL raised AttributeError
            # instead of being reported.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media-server base path; the video
        # itself sits next to it as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2345
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate labels, lowest first; the last entry is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate label.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display dimensions per bitrate label (used by --list-formats output only).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate labels with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL to a concrete episode, download its MRSS index and
        return one info dict per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut names like :tds map to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the newest episode; re-parse
            # the final URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a tag without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp URL) pairs from the config renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a direct HTTP download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2540
2541
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video URL and metadata from an Escapist video page via the
        page's meta tags and the player's JSON-ish configuration."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header, if any.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # BUG FIX: each re.search below may return None; the original called
        # .group() unconditionally and crashed with AttributeError whenever the
        # page layout changed. Report a proper error instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            self._downloader.report_error(u'unable to extract description')
            return
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            self._downloader.report_error(u'unable to extract thumbnail')
            return
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config URL is carried in the player URL's query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract config URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2615
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Flagged as not working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML, then resolve the media URL
        via the referenced Adobe f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML (title, description, thumbnail, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Second request: the f4m manifest referenced by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the fragment URL from the manifest location plus the media id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2686
2687
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL (URL-quoted inside a flashvars-style parameter)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title (taken from the page <title>, minus the site suffix)
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return

        # Thumbnail (full matched URL is kept)
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': title_match.group(1),
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2745
2746
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track via the Soundcloud API and return its info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also part of the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps the canonical page URL to the track's
        # JSON metadata (including its numeric id).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the stream URLs of this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2819
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set via the Soundcloud API and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error, for consistency with SoundcloudIE (trouble() is deprecated)
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also part of the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint maps the canonical page URL to the set's JSON metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Ask the CDN for the stream URLs of this track.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2900
2901
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe stream URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The stream id is stored base64-encoded in the page source.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # Description (optional)
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The id and the extension come from the stream's file name.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2955
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # dead url, try the next one
                pass

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the Mixcloud API for the cloudcast and pick a working file url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match on a unicode url already yields unicode groups; the
        # previous .decode('utf-8') calls raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try every advertised format until one of its urls responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3070
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Handle a single video page, a course page, or the root page.

        Course and root pages are treated as playlists: every linked page is
        recursively fed back through self.extract() and the resulting lists
        are concatenated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # The title and the video file name live in an XML document
            # stored next to the video itself.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id when no <h1> is found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked video page of the course is extracted recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each linked course page is extracted recursively.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3182
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract title, performer and the highest-quality rendition url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns a unicode string, so the old
        # .decode('iso-8859-1') calls were wrong (and crash on Python 3).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error, for consistency with the rest of this class
            # (trouble() is deprecated)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3262
3263
class YoukuIE(InfoExtractor):
    # Matches e.g. http://v.youku.com/v_show/id_XMzgyNjQ2.html
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id from the current time in ms
        plus two random components."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the substitution alphabet used to decode obfuscated file ids.

        The server-provided 'seed' drives a deterministic linear-congruential
        sequence that picks (and removes) one character of the fixed source
        alphabet per step, producing a seed-specific permutation.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id: every '*'-separated number in
        fileId is an index into the seed-derived permutation table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Download Youku's playlist JSON and return one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format onto one of Youku's stream formats.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            # One obfuscated file id plus one key ('k') per video segment.
            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3373
3374
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Download the page and pull out the flv url, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The flv url is percent-encoded in the page source.
        mobj = re.search(self.VIDEO_URL_RE, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(self.VIDEO_TITLE_RE, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(self.VIDEO_THUMB_RE, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3437
3438
3439 class GooglePlusIE(InfoExtractor):
3440     """Information extractor for plus.google.com."""
3441
3442     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3443     IE_NAME = u'plus.google'
3444
    def __init__(self, downloader=None):
        # Delegate all initialization to the InfoExtractor base class.
        InfoExtractor.__init__(self, downloader)
3447
    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3451
    def report_date(self, upload_date):
        """Report the extracted upload date of the entry."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3455
    def report_uploader(self, uploader):
        """Report the extracted uploader of the entry."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3459
    def report_title(self, video_title):
        """Report the extracted title of the video."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3463
    def report_extract_vid_page(self, video_page):
        """Report that the video page is being extracted."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3467
3468     def _real_extract(self, url):
3469         # Extract id from URL
3470         mobj = re.match(self._VALID_URL, url)
3471         if mobj is None:
3472             self._downloader.report_error(u'Invalid URL: %s' % url)
3473             return
3474
3475         post_url = mobj.group(0)
3476         video_id = mobj.group(1)
3477
3478         video_extension = 'flv'
3479
3480         # Step 1, Retrieve post webpage to extract further information
3481         self.report_extract_entry(post_url)
3482         request = compat_urllib_request.Request(post_url)
3483         try:
3484             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3485         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3486             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3487             return
3488
3489         # Extract update date
3490         upload_date = None
3491         pattern = 'title="Timestamp">(.*?)</a>'
3492         mobj = re.search(pattern, webpage)
3493         if mobj:
3494             upload_date = mobj.group(1)
3495             # Convert timestring to a format suitable for filename
3496             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3497             upload_date = upload_date.strftime('%Y%m%d')
3498         self.report_date(upload_date)
3499
3500         # Extract uploader
3501         uploader = None
3502         pattern = r'rel\="author".*?>(.*?)</a>'
3503         mobj = re.search(pattern, webpage)
3504         if mobj:
3505             uploader = mobj.group(1)
3506         self.report_uploader(uploader)
3507
3508         # Extract title
3509         # Get the first line for title
3510         video_title = u'NA'
3511         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3512         mobj = re.search(pattern, webpage)
3513         if mobj:
3514             video_title = mobj.group(1)
3515         self.report_title(video_title)
3516
3517         # Step 2, Stimulate clicking the image box to launch video
3518         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3519         mobj = re.search(pattern, webpage)
3520         if mobj is None:
3521             self._downloader.report_error(u'unable to extract video page URL')
3522
3523         video_page = mobj.group(1)
3524         request = compat_urllib_request.Request(video_page)
3525         try:
3526             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3527         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3528             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3529             return
3530         self.report_extract_vid_page(video_page)
3531
3532
3533         # Extract video links on video page
3534         """Extract video links of all sizes"""
3535         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3536         mobj = re.findall(pattern, webpage)
3537         if len(mobj) == 0:
3538             self._downloader.report_error(u'unable to extract video links')
3539
3540         # Sort in resolution
3541         links = sorted(mobj)
3542
3543         # Choose the lowest of the sort, i.e. highest resolution
3544         video_url = links[-1]
3545         # Only get the url. The resolution part in the tuple has no use anymore
3546         video_url = video_url[-1]
3547         # Treat escaped \u0026 style hex
3548         try:
3549             video_url = video_url.decode("unicode_escape")
3550         except AttributeError: # Python 3
3551             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3552
3553
3554         return [{
3555             'id':       video_id,
3556             'url':      video_url,
3557             'uploader': uploader,
3558             'upload_date':  upload_date,
3559             'title':    video_title,
3560             'ext':      video_extension,
3561         }]
3562
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived from the page path, not scraped.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: key was misspelled 'uploader_date', which no
            # downstream consumer recognizes (documented field is upload_date).
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3598
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_list).

        On any error an empty page (0, []) is returned so that the
        caller's tuple unpacking never sees None.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: returning None here crashed _real_extract's unpack.
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # Bug fix: same as above — never return None.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # A channel URL (group 1 only) lists archives page by page;
        # a /b/ URL addresses a single broadcast.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3685
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Bug fix: previously fell through and crashed on m.group('url')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Bug fix: replaced the deprecated trouble() call and added the
            # missing return (previously crashed on m.group('title')).
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3722
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL uses verbose regex syntax.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # URLs, titles and thumbnails appear in the same document order,
        # so the three iterators are zipped positionally.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Bug fix: skip this entry; previously a result with an
                # empty URL was still appended after reporting the error.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3763
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The video id alone determines the CDN download URL.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title and uploader id are embedded as data-* attributes.
        title = re.search(r'data-title="(?P<title>.+)"',webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3785
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container format from the URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Fixed: use report_error instead of the deprecated trouble()
            # call (which required a hand-written 'ERROR: ' prefix).
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fixed typo in the fallback title: was 'World Start Hip Hop'.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3841
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata lives in an inline JSON blob assigned to gon.show.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot in the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host_info = data.get('host', {})
        image_info = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host_info.get('name'),
            'uploader_id': host_info.get('slug'),
            'thumbnail': image_info.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3876
3877
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Age-gated site: claim age verification already happened.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size/bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Bug fix: previously this tested the stale `result` variable,
            # so an unknown requested format silently returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3994
3995
3996
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is part of the URL, no need to scrape it.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed error message: previously claimed the *title* was missing.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4038
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # Dropped the redundant hand-written 'ERROR: ' prefix — the
            # message is already reported as an error, and sibling
            # extractors raise ExtractorError with the plain text.
            raise ExtractorError(u'unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
4084
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment to PAGE.mix.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id keeps the play/next API happy.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        track_index = 0
        # Walk the play/next API until it flags the last track.
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_index + 1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
            track_index += 1
        return entries
4128
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        # Stream and thumbnail URLs are both derived from the video id.
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
4152
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: matches either a playlist URL (playlists/<id>/<name>)
    # or a single-talk URL (talks/<name>); only one of the two named
    # "type_" groups will be set.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Return the direct download URL for the given mediaSlug.'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Return the info dicts for every talk in the playlist.'''
        # Talk entries (ids/mediaslugs) and their titles/URLs appear in
        # the same document order, so the two finditer streams are zipped
        # positionally below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            # Each talk is extracted from its own page via _talk_info.
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the info dict for the single talk at `url`."""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object; pull id and mediaSlug out of
        # it with a verbose regex rather than parsing the JS.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        # NOTE(review): video_id passed in by the caller is overwritten
        # with the id scraped from the page here.
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4225
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is the last (or, with a trailing slash, second-to-last)
    path component of the URL; stream details come from the site's XML
    metadata endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the file extension when the metadata carries no
            # explicit format id. (Bug fix: this previously read the
            # undefined name `ext`, raising NameError on this path.)
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4281
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Title comes from the HTML page, media details from a per-video XML.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # The last entry in the document is the preferred media type.
        best = idoc[-1]
        filename = best.findall('./filename')[0].text
        duration = float(best.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.split('.')[-1],
            'title': video_title,
            'duration': duration,
        }]
4314
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Bug fix: previously fell through and called m.group() on None,
            # raising AttributeError instead of failing gracefully.
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional; leave them as None if absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4363
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer a numeric documentId query parameter; otherwise fall back
        # to the last path component matched by _VALID_URL.
        url_match = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else url_match.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4403
4404
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep GenericIE last: it matches almost anything and must only run
    # when every site-specific extractor has declined the URL.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]