git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             note = u'Downloading video webpage'
 118         if note is not False:
 119             self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 120         try:
 121             return compat_urllib_request.urlopen(url_or_request)
 122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 123             if errnote is None:
 124                 errnote = u'Unable to download webpage'
 125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 126
 127     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 128         """ Returns the data of the page as a string """
 129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 130         content_type = urlh.headers.get('Content-Type', '')
 131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 132         if m:
 133             encoding = m.group(1)
 134         else:
 135             encoding = 'utf-8'
 136         webpage_bytes = urlh.read()
 137         if self._downloader.params.get('dump_intermediate_pages', False):
 138             try:
 139                 url = url_or_request.get_full_url()
 140             except AttributeError:
 141                 url = url_or_request
 142             self._downloader.to_screen(u'Dumping request to ' + url)
 143             dump = base64.b64encode(webpage_bytes).decode('ascii')
 144             self._downloader.to_screen(dump)
 145         return webpage_bytes.decode(encoding, 'replace')
 146
 147
 148 class YoutubeIE(InfoExtractor):
 149     """Information extractor for youtube.com."""
 150
 151     _VALID_URL = r"""^
 152                      (
 153                          (?:https?://)?                                       # http(s):// (optional)
 154                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 155                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 156                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 157                          (?:                                                  # the various things that can precede the ID:
 158                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 159                              |(?:                                             # or the v= param in all its forms
 160                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 161                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 162                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 163                                  v=
 164                              )
 165                          )?                                                   # optional -> youtube.com/xxxx is OK
 166                      )?                                                       # all until now is optional -> you can pass the naked ID
 167                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 168                      (?(1).+)?                                                # if we found the ID, everything can follow
 169                      $"""
 170     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 171     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 172     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 173     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 174     _NETRC_MACHINE = 'youtube'
 175     # Listed in order of quality
 176     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 177     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 178     _video_extensions = {
 179         '13': '3gp',
 180         '17': 'mp4',
 181         '18': 'mp4',
 182         '22': 'mp4',
 183         '37': 'mp4',
 184         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 185         '43': 'webm',
 186         '44': 'webm',
 187         '45': 'webm',
 188         '46': 'webm',
 189     }
 190     _video_dimensions = {
 191         '5': '240x400',
 192         '6': '???',
 193         '13': '???',
 194         '17': '144x176',
 195         '18': '360x640',
 196         '22': '720x1280',
 197         '34': '360x640',
 198         '35': '480x854',
 199         '37': '1080x1920',
 200         '38': '3072x4096',
 201         '43': '360x640',
 202         '44': '480x854',
 203         '45': '720x1280',
 204         '46': '1080x1920',
 205     }
 206     IE_NAME = u'youtube'
 207
 208     @classmethod
 209     def suitable(cls, url):
 210         """Receives a URL and returns True if suitable for this IE."""
 211         if YoutubePlaylistIE.suitable(url): return False
 212         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 213
 214     def report_lang(self):
 215         """Report attempt to set language."""
 216         self._downloader.to_screen(u'[youtube] Setting language')
 217
 218     def report_login(self):
 219         """Report attempt to log in."""
 220         self._downloader.to_screen(u'[youtube] Logging in')
 221
 222     def report_age_confirmation(self):
 223         """Report attempt to confirm age."""
 224         self._downloader.to_screen(u'[youtube] Confirming age')
 225
 226     def report_video_webpage_download(self, video_id):
 227         """Report attempt to download video webpage."""
 228         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 229
 230     def report_video_info_webpage_download(self, video_id):
 231         """Report attempt to download video info webpage."""
 232         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 233
 234     def report_video_subtitles_download(self, video_id):
 235         """Report attempt to download video info webpage."""
 236         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
 237
 238     def report_video_subtitles_request(self, video_id, sub_lang, format):
 239         """Report attempt to download video info webpage."""
 240         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 241
 242     def report_video_subtitles_available(self, video_id, sub_lang_list):
 243         """Report available subtitles."""
 244         sub_lang = ",".join(list(sub_lang_list.keys()))
 245         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
 246
 247     def report_information_extraction(self, video_id):
 248         """Report attempt to extract video information."""
 249         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 250
 251     def report_unavailable_format(self, video_id, format):
 252         """Report extracted video URL."""
 253         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 254
 255     def report_rtmp_download(self):
 256         """Indicate the download will use the RTMP protocol."""
 257         self._downloader.to_screen(u'[youtube] RTMP download detected')
 258
 259     def _get_available_subtitles(self, video_id):
 260         self.report_video_subtitles_download(video_id)
 261         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 262         try:
 263             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 264         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 265             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 266         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 267         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 268         if not sub_lang_list:
 269             return (u'video doesn\'t have subtitles', None)
 270         return sub_lang_list
 271
 272     def _list_available_subtitles(self, video_id):
 273         sub_lang_list = self._get_available_subtitles(video_id)
 274         self.report_video_subtitles_available(video_id, sub_lang_list)
 275
 276     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 277         """
 278         Return tuple:
 279         (error_message, sub_lang, sub)
 280         """
 281         self.report_video_subtitles_request(video_id, sub_lang, format)
 282         params = compat_urllib_parse.urlencode({
 283             'lang': sub_lang,
 284             'name': sub_name,
 285             'v': video_id,
 286             'fmt': format,
 287         })
 288         url = 'http://www.youtube.com/api/timedtext?' + params
 289         try:
 290             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 291         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 292             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 293         if not sub:
 294             return (u'Did not fetch video subtitles', None, None)
 295         return (None, sub_lang, sub)
 296
 297     def _extract_subtitle(self, video_id):
 298         """
 299         Return a list with a tuple:
 300         [(error_message, sub_lang, sub)]
 301         """
 302         sub_lang_list = self._get_available_subtitles(video_id)
 303         sub_format = self._downloader.params.get('subtitlesformat')
 304         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 305             return [(sub_lang_list[0], None, None)]
 306         if self._downloader.params.get('subtitleslang', False):
 307             sub_lang = self._downloader.params.get('subtitleslang')
 308         elif 'en' in sub_lang_list:
 309             sub_lang = 'en'
 310         else:
 311             sub_lang = list(sub_lang_list.keys())[0]
 312         if not sub_lang in sub_lang_list:
 313             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 314
 315         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 316         return [subtitle]
 317
 318     def _extract_all_subtitles(self, video_id):
 319         sub_lang_list = self._get_available_subtitles(video_id)
 320         sub_format = self._downloader.params.get('subtitlesformat')
 321         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 322             return [(sub_lang_list[0], None, None)]
 323         subtitles = []
 324         for sub_lang in sub_lang_list:
 325             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 326             subtitles.append(subtitle)
 327         return subtitles
 328
 329     def _print_formats(self, formats):
 330         print('Available formats:')
 331         for x in formats:
 332             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 333
 334     def _real_initialize(self):
 335         if self._downloader is None:
 336             return
 337
 338         username = None
 339         password = None
 340         downloader_params = self._downloader.params
 341
 342         # Attempt to use provided username and password or .netrc data
 343         if downloader_params.get('username', None) is not None:
 344             username = downloader_params['username']
 345             password = downloader_params['password']
 346         elif downloader_params.get('usenetrc', False):
 347             try:
 348                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 349                 if info is not None:
 350                     username = info[0]
 351                     password = info[2]
 352                 else:
 353                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 354             except (IOError, netrc.NetrcParseError) as err:
 355                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 356                 return
 357
 358         # Set language
 359         request = compat_urllib_request.Request(self._LANG_URL)
 360         try:
 361             self.report_lang()
 362             compat_urllib_request.urlopen(request).read()
 363         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 364             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 365             return
 366
 367         # No authentication to be performed
 368         if username is None:
 369             return
 370
 371         request = compat_urllib_request.Request(self._LOGIN_URL)
 372         try:
 373             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 374         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 375             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 376             return
 377
 378         galx = None
 379         dsh = None
 380         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 381         if match:
 382           galx = match.group(1)
 383
 384         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 385         if match:
 386           dsh = match.group(1)
 387
 388         # Log in
 389         login_form_strs = {
 390                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 391                 u'Email': username,
 392                 u'GALX': galx,
 393                 u'Passwd': password,
 394                 u'PersistentCookie': u'yes',
 395                 u'_utf8': u'霱',
 396                 u'bgresponse': u'js_disabled',
 397                 u'checkConnection': u'',
 398                 u'checkedDomains': u'youtube',
 399                 u'dnConn': u'',
 400                 u'dsh': dsh,
 401                 u'pstMsg': u'0',
 402                 u'rmShown': u'1',
 403                 u'secTok': u'',
 404                 u'signIn': u'Sign in',
 405                 u'timeStmp': u'',
 406                 u'service': u'youtube',
 407                 u'uilel': u'3',
 408                 u'hl': u'en_US',
 409         }
 410         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 411         # chokes on unicode
 412         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 413         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 414         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 415         try:
 416             self.report_login()
 417             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 418             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 419                 self._downloader.report_warning(u'unable to log in: bad username or password')
 420                 return
 421         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 422             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 423             return
 424
 425         # Confirm age
 426         age_form = {
 427                 'next_url':     '/',
 428                 'action_confirm':   'Confirm',
 429                 }
 430         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 431         try:
 432             self.report_age_confirmation()
 433             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 434         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 435             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 436             return
 437
 438     def _extract_id(self, url):
 439         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 440         if mobj is None:
 441             self._downloader.report_error(u'invalid URL: %s' % url)
 442             return
 443         video_id = mobj.group(2)
 444         return video_id
 445
 446     def _real_extract(self, url):
 447         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 448         mobj = re.search(self._NEXT_URL_RE, url)
 449         if mobj:
 450             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 451         video_id = self._extract_id(url)
 452
 453         # Get video webpage
 454         self.report_video_webpage_download(video_id)
 455         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 456         request = compat_urllib_request.Request(url)
 457         try:
 458             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 459         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 460             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
 461             return
 462
 463         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 464
 465         # Attempt to extract SWF player URL
 466         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 467         if mobj is not None:
 468             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 469         else:
 470             player_url = None
 471
 472         # Get video info
 473         self.report_video_info_webpage_download(video_id)
 474         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 475             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 476                     % (video_id, el_type))
 477             video_info_webpage = self._download_webpage(video_info_url, video_id,
 478                                     note=False,
 479                                     errnote='unable to download video info webpage')
 480             video_info = compat_parse_qs(video_info_webpage)
 481             if 'token' in video_info:
 482                 break
 483         if 'token' not in video_info:
 484             if 'reason' in video_info:
 485                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
 486             else:
 487                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
 488             return
 489
 490         # Check for "rental" videos
 491         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 492             self._downloader.report_error(u'"rental" videos not supported')
 493             return
 494
 495         # Start extracting information
 496         self.report_information_extraction(video_id)
 497
 498         # uploader
 499         if 'author' not in video_info:
 500             self._downloader.report_error(u'unable to extract uploader name')
 501             return
 502         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 503
 504         # uploader_id
 505         video_uploader_id = None
 506         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 507         if mobj is not None:
 508             video_uploader_id = mobj.group(1)
 509         else:
 510             self._downloader.report_warning(u'unable to extract uploader nickname')
 511
 512         # title
 513         if 'title' not in video_info:
 514             self._downloader.report_error(u'unable to extract video title')
 515             return
 516         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 517
 518         # thumbnail image
 519         if 'thumbnail_url' not in video_info:
 520             self._downloader.report_warning(u'unable to extract video thumbnail')
 521             video_thumbnail = ''
 522         else:   # don't panic if we can't find it
 523             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 524
 525         # upload date
 526         upload_date = None
 527         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 528         if mobj is not None:
 529             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 530             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 531             for expression in format_expressions:
 532                 try:
 533                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 534                 except:
 535                     pass
 536
 537         # description
 538         video_description = get_element_by_id("eow-description", video_webpage)
 539         if video_description:
 540             video_description = clean_html(video_description)
 541         else:
 542             video_description = ''
 543
 544         # subtitles
 545         video_subtitles = None
 546
 547         if self._downloader.params.get('writesubtitles', False):
 548             video_subtitles = self._extract_subtitle(video_id)
 549             if video_subtitles:
 550                 (sub_error, sub_lang, sub) = video_subtitles[0]
 551                 if sub_error:
 552                     self._downloader.report_error(sub_error)
 553
 554         if self._downloader.params.get('allsubtitles', False):
 555             video_subtitles = self._extract_all_subtitles(video_id)
 556             for video_subtitle in video_subtitles:
 557                 (sub_error, sub_lang, sub) = video_subtitle
 558                 if sub_error:
 559                     self._downloader.report_error(sub_error)
 560
 561         if self._downloader.params.get('listsubtitles', False):
 562             sub_lang_list = self._list_available_subtitles(video_id)
 563             return
 564
 565         if 'length_seconds' not in video_info:
 566             self._downloader.report_warning(u'unable to extract video duration')
 567             video_duration = ''
 568         else:
 569             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 570
 571         # token
 572         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 573
 574         # Decide which formats to download
 575         req_format = self._downloader.params.get('format', None)
 576
 577         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 578             self.report_rtmp_download()
 579             video_url_list = [(None, video_info['conn'][0])]
 580         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 581             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 582             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 583             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 584             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 585
 586             format_limit = self._downloader.params.get('format_limit', None)
 587             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 588             if format_limit is not None and format_limit in available_formats:
 589                 format_list = available_formats[available_formats.index(format_limit):]
 590             else:
 591                 format_list = available_formats
 592             existing_formats = [x for x in format_list if x in url_map]
 593             if len(existing_formats) == 0:
 594                 self._downloader.report_error(u'no known formats available for video')
 595                 return
 596             if self._downloader.params.get('listformats', None):
 597                 self._print_formats(existing_formats)
 598                 return
 599             if req_format is None or req_format == 'best':
 600                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 601             elif req_format == 'worst':
 602                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 603             elif req_format in ('-1', 'all'):
 604                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 605             else:
 606                 # Specific formats. We pick the first in a slash-delimeted sequence.
 607                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 608                 req_formats = req_format.split('/')
 609                 video_url_list = None
 610                 for rf in req_formats:
 611                     if rf in url_map:
 612                         video_url_list = [(rf, url_map[rf])]
 613                         break
 614                 if video_url_list is None:
 615                     self._downloader.report_error(u'requested format not available')
 616                     return
 617         else:
 618             self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
 619             return
 620
 621         results = []
 622         for format_param, video_real_url in video_url_list:
 623             # Extension
 624             video_extension = self._video_extensions.get(format_param, 'flv')
 625
 626             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 627                                               self._video_dimensions.get(format_param, '???'))
 628
 629             results.append({
 630                 'id':       video_id,
 631                 'url':      video_real_url,
 632                 'uploader': video_uploader,
 633                 'uploader_id': video_uploader_id,
 634                 'upload_date':  upload_date,
 635                 'title':    video_title,
 636                 'ext':      video_extension,
 637                 'format':   video_format,
 638                 'thumbnail':    video_thumbnail,
 639                 'description':  video_description,
 640                 'player_url':   player_url,
 641                 'subtitles':    video_subtitles,
 642                 'duration':     video_duration
 643             })
 644         return results
 645
 646
 647 class MetacafeIE(InfoExtractor):
 648     """Information Extractor for metacafe.com."""
 649
 650     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 651     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 652     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 653     IE_NAME = u'metacafe'
 654
 655     def __init__(self, downloader=None):
 656         InfoExtractor.__init__(self, downloader)
 657
 658     def report_disclaimer(self):
 659         """Report disclaimer retrieval."""
 660         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 661
 662     def report_age_confirmation(self):
 663         """Report attempt to confirm age."""
 664         self._downloader.to_screen(u'[metacafe] Confirming age')
 665
 666     def report_download_webpage(self, video_id):
 667         """Report webpage download."""
 668         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 669
 670     def report_extraction(self, video_id):
 671         """Report information extraction."""
 672         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 673
 674     def _real_initialize(self):
 675         # Retrieve disclaimer
 676         request = compat_urllib_request.Request(self._DISCLAIMER)
 677         try:
 678             self.report_disclaimer()
 679             disclaimer = compat_urllib_request.urlopen(request).read()
 680         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 681             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
 682             return
 683
 684         # Confirm age
 685         disclaimer_form = {
 686             'filters': '0',
 687             'submit': "Continue - I'm over 18",
 688             }
 689         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 690         try:
 691             self.report_age_confirmation()
 692             disclaimer = compat_urllib_request.urlopen(request).read()
 693         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 694             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 695             return
 696
 697     def _real_extract(self, url):
 698         # Extract id and simplified title from URL
 699         mobj = re.match(self._VALID_URL, url)
 700         if mobj is None:
 701             self._downloader.report_error(u'invalid URL: %s' % url)
 702             return
 703
 704         video_id = mobj.group(1)
 705
 706         # Check if video comes from YouTube
 707         mobj2 = re.match(r'^yt-(.*)$', video_id)
 708         if mobj2 is not None:
 709             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 710             return
 711
 712         # Retrieve video webpage to extract further information
 713         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 714         try:
 715             self.report_download_webpage(video_id)
 716             webpage = compat_urllib_request.urlopen(request).read()
 717         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 718             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
 719             return
 720
 721         # Extract URL, uploader and title from webpage
 722         self.report_extraction(video_id)
 723         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 724         if mobj is not None:
 725             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 726             video_extension = mediaURL[-3:]
 727
 728             # Extract gdaKey if available
 729             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 730             if mobj is None:
 731                 video_url = mediaURL
 732             else:
 733                 gdaKey = mobj.group(1)
 734                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 735         else:
 736             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 737             if mobj is None:
 738                 self._downloader.report_error(u'unable to extract media URL')
 739                 return
 740             vardict = compat_parse_qs(mobj.group(1))
 741             if 'mediaData' not in vardict:
 742                 self._downloader.report_error(u'unable to extract media URL')
 743                 return
 744             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 745             if mobj is None:
 746                 self._downloader.report_error(u'unable to extract media URL')
 747                 return
 748             mediaURL = mobj.group(1).replace('\\/', '/')
 749             video_extension = mediaURL[-3:]
 750             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 751
 752         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 753         if mobj is None:
 754             self._downloader.report_error(u'unable to extract title')
 755             return
 756         video_title = mobj.group(1).decode('utf-8')
 757
 758         mobj = re.search(r'submitter=(.*?);', webpage)
 759         if mobj is None:
 760             self._downloader.report_error(u'unable to extract uploader nickname')
 761             return
 762         video_uploader = mobj.group(1)
 763
 764         return [{
 765             'id':       video_id.decode('utf-8'),
 766             'url':      video_url.decode('utf-8'),
 767             'uploader': video_uploader.decode('utf-8'),
 768             'upload_date':  None,
 769             'title':    video_title,
 770             'ext':      video_extension.decode('utf-8'),
 771         }]
 772
 773
 774 class DailymotionIE(InfoExtractor):
 775     """Information Extractor for Dailymotion"""
 776
 777     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 778     IE_NAME = u'dailymotion'
 779     _WORKING = False
 780
 781     def __init__(self, downloader=None):
 782         InfoExtractor.__init__(self, downloader)
 783
 784     def report_extraction(self, video_id):
 785         """Report information extraction."""
 786         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 787
 788     def _real_extract(self, url):
 789         # Extract id and simplified title from URL
 790         mobj = re.match(self._VALID_URL, url)
 791         if mobj is None:
 792             self._downloader.report_error(u'invalid URL: %s' % url)
 793             return
 794
 795         video_id = mobj.group(1).split('_')[0].split('?')[0]
 796
 797         video_extension = 'mp4'
 798
 799         # Retrieve video webpage to extract further information
 800         request = compat_urllib_request.Request(url)
 801         request.add_header('Cookie', 'family_filter=off')
 802         webpage = self._download_webpage(request, video_id)
 803
 804         # Extract URL, uploader and title from webpage
 805         self.report_extraction(video_id)
 806         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 807         if mobj is None:
 808             self._downloader.report_error(u'unable to extract media URL')
 809             return
 810         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 811
 812         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 813             if key in flashvars:
 814                 max_quality = key
 815                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 816                 break
 817         else:
 818             self._downloader.report_error(u'unable to extract video URL')
 819             return
 820
 821         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 822         if mobj is None:
 823             self._downloader.report_error(u'unable to extract video URL')
 824             return
 825
 826         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 827
 828         # TODO: support choosing qualities
 829
 830         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 831         if mobj is None:
 832             self._downloader.report_error(u'unable to extract title')
 833             return
 834         video_title = unescapeHTML(mobj.group('title'))
 835
 836         video_uploader = None
 837         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 838         if mobj is None:
 839             # lookin for official user
 840             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 841             if mobj_official is None:
 842                 self._downloader.report_warning(u'unable to extract uploader nickname')
 843             else:
 844                 video_uploader = mobj_official.group(1)
 845         else:
 846             video_uploader = mobj.group(1)
 847
 848         video_upload_date = None
 849         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 850         if mobj is not None:
 851             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 852
 853         return [{
 854             'id':       video_id,
 855             'url':      video_url,
 856             'uploader': video_uploader,
 857             'upload_date':  video_upload_date,
 858             'title':    video_title,
 859             'ext':      video_extension,
 860         }]
 861
 862
 863 class PhotobucketIE(InfoExtractor):
 864     """Information extractor for photobucket.com."""
 865
 866     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 867     IE_NAME = u'photobucket'
 868
 869     def __init__(self, downloader=None):
 870         InfoExtractor.__init__(self, downloader)
 871
 872     def report_download_webpage(self, video_id):
 873         """Report webpage download."""
 874         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 875
 876     def report_extraction(self, video_id):
 877         """Report information extraction."""
 878         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 879
 880     def _real_extract(self, url):
 881         # Extract id from URL
 882         mobj = re.match(self._VALID_URL, url)
 883         if mobj is None:
 884             self._downloader.report_error(u'Invalid URL: %s' % url)
 885             return
 886
 887         video_id = mobj.group(1)
 888
 889         video_extension = 'flv'
 890
 891         # Retrieve video webpage to extract further information
 892         request = compat_urllib_request.Request(url)
 893         try:
 894             self.report_download_webpage(video_id)
 895             webpage = compat_urllib_request.urlopen(request).read()
 896         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 897             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 898             return
 899
 900         # Extract URL, uploader, and title from webpage
 901         self.report_extraction(video_id)
 902         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 903         if mobj is None:
 904             self._downloader.report_error(u'unable to extract media URL')
 905             return
 906         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 907
 908         video_url = mediaURL
 909
 910         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 911         if mobj is None:
 912             self._downloader.report_error(u'unable to extract title')
 913             return
 914         video_title = mobj.group(1).decode('utf-8')
 915
 916         video_uploader = mobj.group(2).decode('utf-8')
 917
 918         return [{
 919             'id':       video_id.decode('utf-8'),
 920             'url':      video_url.decode('utf-8'),
 921             'uploader': video_uploader,
 922             'upload_date':  None,
 923             'title':    video_title,
 924             'ext':      video_extension.decode('utf-8'),
 925         }]
 926
 927
 928 class YahooIE(InfoExtractor):
 929     """Information extractor for video.yahoo.com."""
 930
 931     _WORKING = False
 932     # _VALID_URL matches all Yahoo! Video URLs
 933     # _VPAGE_URL matches only the extractable '/watch/' URLs
 934     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 935     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 936     IE_NAME = u'video.yahoo'
 937
 938     def __init__(self, downloader=None):
 939         InfoExtractor.__init__(self, downloader)
 940
 941     def report_download_webpage(self, video_id):
 942         """Report webpage download."""
 943         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 944
 945     def report_extraction(self, video_id):
 946         """Report information extraction."""
 947         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 948
 949     def _real_extract(self, url, new_video=True):
 950         # Extract ID from URL
 951         mobj = re.match(self._VALID_URL, url)
 952         if mobj is None:
 953             self._downloader.report_error(u'Invalid URL: %s' % url)
 954             return
 955
 956         video_id = mobj.group(2)
 957         video_extension = 'flv'
 958
 959         # Rewrite valid but non-extractable URLs as
 960         # extractable English language /watch/ URLs
 961         if re.match(self._VPAGE_URL, url) is None:
 962             request = compat_urllib_request.Request(url)
 963             try:
 964                 webpage = compat_urllib_request.urlopen(request).read()
 965             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 966                 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 967                 return
 968
 969             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 970             if mobj is None:
 971                 self._downloader.report_error(u'Unable to extract id field')
 972                 return
 973             yahoo_id = mobj.group(1)
 974
 975             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 976             if mobj is None:
 977                 self._downloader.report_error(u'Unable to extract vid field')
 978                 return
 979             yahoo_vid = mobj.group(1)
 980
 981             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 982             return self._real_extract(url, new_video=False)
 983
 984         # Retrieve video webpage to extract further information
 985         request = compat_urllib_request.Request(url)
 986         try:
 987             self.report_download_webpage(video_id)
 988             webpage = compat_urllib_request.urlopen(request).read()
 989         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 990             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 991             return
 992
 993         # Extract uploader and title from webpage
 994         self.report_extraction(video_id)
 995         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 996         if mobj is None:
 997             self._downloader.report_error(u'unable to extract video title')
 998             return
 999         video_title = mobj.group(1).decode('utf-8')
1000
1001         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1002         if mobj is None:
1003             self._downloader.report_error(u'unable to extract video uploader')
1004             return
1005         video_uploader = mobj.group(1).decode('utf-8')
1006
1007         # Extract video thumbnail
1008         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1009         if mobj is None:
1010             self._downloader.report_error(u'unable to extract video thumbnail')
1011             return
1012         video_thumbnail = mobj.group(1).decode('utf-8')
1013
1014         # Extract video description
1015         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1016         if mobj is None:
1017             self._downloader.report_error(u'unable to extract video description')
1018             return
1019         video_description = mobj.group(1).decode('utf-8')
1020         if not video_description:
1021             video_description = 'No description available.'
1022
1023         # Extract video height and width
1024         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1025         if mobj is None:
1026             self._downloader.report_error(u'unable to extract video height')
1027             return
1028         yv_video_height = mobj.group(1)
1029
1030         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1031         if mobj is None:
1032             self._downloader.report_error(u'unable to extract video width')
1033             return
1034         yv_video_width = mobj.group(1)
1035
1036         # Retrieve video playlist to extract media URL
1037         # I'm not completely sure what all these options are, but we
1038         # seem to need most of them, otherwise the server sends a 401.
1039         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1040         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1041         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1042                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1043                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1044         try:
1045             self.report_download_webpage(video_id)
1046             webpage = compat_urllib_request.urlopen(request).read()
1047         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1048             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1049             return
1050
1051         # Extract media URL from playlist XML
1052         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1053         if mobj is None:
1054             self._downloader.report_error(u'Unable to extract media URL')
1055             return
1056         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1057         video_url = unescapeHTML(video_url)
1058
1059         return [{
1060             'id':       video_id.decode('utf-8'),
1061             'url':      video_url,
1062             'uploader': video_uploader,
1063             'upload_date':  None,
1064             'title':    video_title,
1065             'ext':      video_extension.decode('utf-8'),
1066             'thumbnail':    video_thumbnail.decode('utf-8'),
1067             'description':  video_description,
1068         }]
1069
1070
1071 class VimeoIE(InfoExtractor):
1072     """Information extractor for vimeo.com."""
1073
1074     # _VALID_URL matches Vimeo URLs
1075     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1076     IE_NAME = u'vimeo'
1077
1078     def __init__(self, downloader=None):
1079         InfoExtractor.__init__(self, downloader)
1080
1081     def report_download_webpage(self, video_id):
1082         """Report webpage download."""
1083         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1084
1085     def report_extraction(self, video_id):
1086         """Report information extraction."""
1087         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1088
1089     def _real_extract(self, url, new_video=True):
1090         # Extract ID from URL
1091         mobj = re.match(self._VALID_URL, url)
1092         if mobj is None:
1093             self._downloader.report_error(u'Invalid URL: %s' % url)
1094             return
1095
1096         video_id = mobj.group('id')
1097         if not mobj.group('proto'):
1098             url = 'https://' + url
1099         if mobj.group('direct_link'):
1100             url = 'https://vimeo.com/' + video_id
1101
1102         # Retrieve video webpage to extract further information
1103         request = compat_urllib_request.Request(url, None, std_headers)
1104         try:
1105             self.report_download_webpage(video_id)
1106             webpage_bytes = compat_urllib_request.urlopen(request).read()
1107             webpage = webpage_bytes.decode('utf-8')
1108         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1109             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1110             return
1111
1112         # Now we begin extracting as much information as we can from what we
1113         # retrieved. First we extract the information common to all extractors,
1114         # and latter we extract those that are Vimeo specific.
1115         self.report_extraction(video_id)
1116
1117         # Extract the config JSON
1118         try:
1119             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1120             config = json.loads(config)
1121         except:
1122             self._downloader.report_error(u'unable to extract info section')
1123             return
1124
1125         # Extract title
1126         video_title = config["video"]["title"]
1127
1128         # Extract uploader and uploader_id
1129         video_uploader = config["video"]["owner"]["name"]
1130         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1131
1132         # Extract video thumbnail
1133         video_thumbnail = config["video"]["thumbnail"]
1134
1135         # Extract video description
1136         video_description = get_element_by_attribute("itemprop", "description", webpage)
1137         if video_description: video_description = clean_html(video_description)
1138         else: video_description = u''
1139
1140         # Extract upload date
1141         video_upload_date = None
1142         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1143         if mobj is not None:
1144             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1145
1146         # Vimeo specific: extract request signature and timestamp
1147         sig = config['request']['signature']
1148         timestamp = config['request']['timestamp']
1149
1150         # Vimeo specific: extract video codec and quality information
1151         # First consider quality, then codecs, then take everything
1152         # TODO bind to format param
1153         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1154         files = { 'hd': [], 'sd': [], 'other': []}
1155         for codec_name, codec_extension in codecs:
1156             if codec_name in config["video"]["files"]:
1157                 if 'hd' in config["video"]["files"][codec_name]:
1158                     files['hd'].append((codec_name, codec_extension, 'hd'))
1159                 elif 'sd' in config["video"]["files"][codec_name]:
1160                     files['sd'].append((codec_name, codec_extension, 'sd'))
1161                 else:
1162                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1163
1164         for quality in ('hd', 'sd', 'other'):
1165             if len(files[quality]) > 0:
1166                 video_quality = files[quality][0][2]
1167                 video_codec = files[quality][0][0]
1168                 video_extension = files[quality][0][1]
1169                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1170                 break
1171         else:
1172             self._downloader.report_error(u'no known codec found')
1173             return
1174
1175         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1176                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1177
1178         return [{
1179             'id':       video_id,
1180             'url':      video_url,
1181             'uploader': video_uploader,
1182             'uploader_id': video_uploader_id,
1183             'upload_date':  video_upload_date,
1184             'title':    video_title,
1185             'ext':      video_extension,
1186             'thumbnail':    video_thumbnail,
1187             'description':  video_description,
1188         }]
1189
1190
1191 class ArteTvIE(InfoExtractor):
1192     """arte.tv information extractor."""
1193
1194     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1195     _LIVE_URL = r'index-[0-9]+\.html$'
1196
1197     IE_NAME = u'arte.tv'
1198
1199     def __init__(self, downloader=None):
1200         InfoExtractor.__init__(self, downloader)
1201
1202     def report_download_webpage(self, video_id):
1203         """Report webpage download."""
1204         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1205
1206     def report_extraction(self, video_id):
1207         """Report information extraction."""
1208         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1209
1210     def fetch_webpage(self, url):
1211         request = compat_urllib_request.Request(url)
1212         try:
1213             self.report_download_webpage(url)
1214             webpage = compat_urllib_request.urlopen(request).read()
1215         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1216             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1217             return
1218         except ValueError as err:
1219             self._downloader.report_error(u'Invalid URL: %s' % url)
1220             return
1221         return webpage
1222
1223     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1224         page = self.fetch_webpage(url)
1225         mobj = re.search(regex, page, regexFlags)
1226         info = {}
1227
1228         if mobj is None:
1229             self._downloader.report_error(u'Invalid URL: %s' % url)
1230             return
1231
1232         for (i, key, err) in matchTuples:
1233             if mobj.group(i) is None:
1234                 self._downloader.trouble(err)
1235                 return
1236             else:
1237                 info[key] = mobj.group(i)
1238
1239         return info
1240
1241     def extractLiveStream(self, url):
1242         video_lang = url.split('/')[-4]
1243         info = self.grep_webpage(
1244             url,
1245             r'src="(.*?/videothek_js.*?\.js)',
1246             0,
1247             [
1248                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1249             ]
1250         )
1251         http_host = url.split('/')[2]
1252         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1253         info = self.grep_webpage(
1254             next_url,
1255             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1256                 '(http://.*?\.swf).*?' +
1257                 '(rtmp://.*?)\'',
1258             re.DOTALL,
1259             [
1260                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1261                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1262                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1263             ]
1264         )
1265         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1266
1267     def extractPlus7Stream(self, url):
1268         video_lang = url.split('/')[-3]
1269         info = self.grep_webpage(
1270             url,
1271             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1272             0,
1273             [
1274                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1275             ]
1276         )
1277         next_url = compat_urllib_parse.unquote(info.get('url'))
1278         info = self.grep_webpage(
1279             next_url,
1280             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1281             0,
1282             [
1283                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1284             ]
1285         )
1286         next_url = compat_urllib_parse.unquote(info.get('url'))
1287
1288         info = self.grep_webpage(
1289             next_url,
1290             r'<video id="(.*?)".*?>.*?' +
1291                 '<name>(.*?)</name>.*?' +
1292                 '<dateVideo>(.*?)</dateVideo>.*?' +
1293                 '<url quality="hd">(.*?)</url>',
1294             re.DOTALL,
1295             [
1296                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1297                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1298                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1299                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1300             ]
1301         )
1302
1303         return {
1304             'id':           info.get('id'),
1305             'url':          compat_urllib_parse.unquote(info.get('url')),
1306             'uploader':     u'arte.tv',
1307             'upload_date':  info.get('date'),
1308             'title':        info.get('title').decode('utf-8'),
1309             'ext':          u'mp4',
1310             'format':       u'NA',
1311             'player_url':   None,
1312         }
1313
1314     def _real_extract(self, url):
1315         video_id = url.split('/')[-1]
1316         self.report_extraction(video_id)
1317
1318         if re.search(self._LIVE_URL, video_id) is not None:
1319             self.extractLiveStream(url)
1320             return
1321         else:
1322             info = self.extractPlus7Stream(url)
1323
1324         return [info]
1325
1326
1327 class GenericIE(InfoExtractor):
1328     """Generic last-resort information extractor."""
1329
1330     _VALID_URL = r'.*'
1331     IE_NAME = u'generic'
1332
1333     def __init__(self, downloader=None):
1334         InfoExtractor.__init__(self, downloader)
1335
1336     def report_download_webpage(self, video_id):
1337         """Report webpage download."""
1338         if not self._downloader.params.get('test', False):
1339             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1340         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1341
1342     def report_extraction(self, video_id):
1343         """Report information extraction."""
1344         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1345
1346     def report_following_redirect(self, new_url):
1347         """Report information extraction."""
1348         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1349
1350     def _test_redirect(self, url):
1351         """Check if it is a redirect, like url shorteners, in case restart chain."""
1352         class HeadRequest(compat_urllib_request.Request):
1353             def get_method(self):
1354                 return "HEAD"
1355
1356         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1357             """
1358             Subclass the HTTPRedirectHandler to make it use our
1359             HeadRequest also on the redirected URL
1360             """
1361             def redirect_request(self, req, fp, code, msg, headers, newurl):
1362                 if code in (301, 302, 303, 307):
1363                     newurl = newurl.replace(' ', '%20')
1364                     newheaders = dict((k,v) for k,v in req.headers.items()
1365                                       if k.lower() not in ("content-length", "content-type"))
1366                     return HeadRequest(newurl,
1367                                        headers=newheaders,
1368                                        origin_req_host=req.get_origin_req_host(),
1369                                        unverifiable=True)
1370                 else:
1371                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1372
1373         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1374             """
1375             Fallback to GET if HEAD is not allowed (405 HTTP error)
1376             """
1377             def http_error_405(self, req, fp, code, msg, headers):
1378                 fp.read()
1379                 fp.close()
1380
1381                 newheaders = dict((k,v) for k,v in req.headers.items()
1382                                   if k.lower() not in ("content-length", "content-type"))
1383                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1384                                                  headers=newheaders,
1385                                                  origin_req_host=req.get_origin_req_host(),
1386                                                  unverifiable=True))
1387
1388         # Build our opener
1389         opener = compat_urllib_request.OpenerDirector()
1390         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1391                         HTTPMethodFallback, HEADRedirectHandler,
1392                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1393             opener.add_handler(handler())
1394
1395         response = opener.open(HeadRequest(url))
1396         new_url = response.geturl()
1397
1398         if url == new_url:
1399             return False
1400
1401         self.report_following_redirect(new_url)
1402         self._downloader.download([new_url])
1403         return True
1404
1405     def _real_extract(self, url):
1406         if self._test_redirect(url): return
1407
1408         video_id = url.split('/')[-1]
1409         try:
1410             webpage = self._download_webpage(url, video_id)
1411         except ValueError as err:
1412             # since this is the last-resort InfoExtractor, if
1413             # this error is thrown, it'll be thrown here
1414             self._downloader.report_error(u'Invalid URL: %s' % url)
1415             return
1416
1417         self.report_extraction(video_id)
1418         # Start with something easy: JW Player in SWFObject
1419         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1420         if mobj is None:
1421             # Broaden the search a little bit
1422             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1423         if mobj is None:
1424             # Broaden the search a little bit: JWPlayer JS loader
1425             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1426         if mobj is None:
1427             self._downloader.report_error(u'Invalid URL: %s' % url)
1428             return
1429
1430         # It's possible that one of the regexes
1431         # matched, but returned an empty group:
1432         if mobj.group(1) is None:
1433             self._downloader.report_error(u'Invalid URL: %s' % url)
1434             return
1435
1436         video_url = compat_urllib_parse.unquote(mobj.group(1))
1437         video_id = os.path.basename(video_url)
1438
1439         # here's a fun little line of code for you:
1440         video_extension = os.path.splitext(video_id)[1][1:]
1441         video_id = os.path.splitext(video_id)[0]
1442
1443         # it's tempting to parse this further, but you would
1444         # have to take into account all the variations like
1445         #   Video Title - Site Name
1446         #   Site Name | Video Title
1447         #   Video Title - Tagline | Site Name
1448         # and so on and so forth; it's just not practical
1449         mobj = re.search(r'<title>(.*)</title>', webpage)
1450         if mobj is None:
1451             self._downloader.report_error(u'unable to extract title')
1452             return
1453         video_title = mobj.group(1)
1454
1455         # video uploader is domain name
1456         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1457         if mobj is None:
1458             self._downloader.report_error(u'unable to extract title')
1459             return
1460         video_uploader = mobj.group(1)
1461
1462         return [{
1463             'id':       video_id,
1464             'url':      video_url,
1465             'uploader': video_uploader,
1466             'upload_date':  None,
1467             'title':    video_title,
1468             'ext':      video_extension,
1469         }]
1470
1471
1472 class YoutubeSearchIE(InfoExtractor):
1473     """Information Extractor for YouTube search queries."""
1474     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1475     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1476     _max_youtube_results = 1000
1477     IE_NAME = u'youtube:search'
1478
1479     def __init__(self, downloader=None):
1480         InfoExtractor.__init__(self, downloader)
1481
1482     def report_download_page(self, query, pagenum):
1483         """Report attempt to download search page with given number."""
1484         query = query.decode(preferredencoding())
1485         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1486
1487     def _real_extract(self, query):
1488         mobj = re.match(self._VALID_URL, query)
1489         if mobj is None:
1490             self._downloader.report_error(u'invalid search query "%s"' % query)
1491             return
1492
1493         prefix, query = query.split(':')
1494         prefix = prefix[8:]
1495         query = query.encode('utf-8')
1496         if prefix == '':
1497             self._download_n_results(query, 1)
1498             return
1499         elif prefix == 'all':
1500             self._download_n_results(query, self._max_youtube_results)
1501             return
1502         else:
1503             try:
1504                 n = int(prefix)
1505                 if n <= 0:
1506                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1507                     return
1508                 elif n > self._max_youtube_results:
1509                     self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1510                     n = self._max_youtube_results
1511                 self._download_n_results(query, n)
1512                 return
1513             except ValueError: # parsing prefix as integer fails
1514                 self._download_n_results(query, 1)
1515                 return
1516
1517     def _download_n_results(self, query, n):
1518         """Downloads a specified number of results for a query"""
1519
1520         video_ids = []
1521         pagenum = 0
1522         limit = n
1523
1524         while (50 * pagenum) < limit:
1525             self.report_download_page(query, pagenum+1)
1526             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1527             request = compat_urllib_request.Request(result_url)
1528             try:
1529                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1530             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1531                 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1532                 return
1533             api_response = json.loads(data)['data']
1534
1535             if not 'items' in api_response:
1536                 self._downloader.trouble(u'[youtube] No video results')
1537                 return
1538
1539             new_ids = list(video['id'] for video in api_response['items'])
1540             video_ids += new_ids
1541
1542             limit = min(n, api_response['totalItems'])
1543             pagenum += 1
1544
1545         if len(video_ids) > n:
1546             video_ids = video_ids[:n]
1547         for id in video_ids:
1548             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1549         return
1550
1551
1552 class GoogleSearchIE(InfoExtractor):
1553     """Information Extractor for Google Video search queries."""
1554     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1555     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1556     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1557     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1558     _max_google_results = 1000
1559     IE_NAME = u'video.google:search'
1560
1561     def __init__(self, downloader=None):
1562         InfoExtractor.__init__(self, downloader)
1563
1564     def report_download_page(self, query, pagenum):
1565         """Report attempt to download playlist page with given number."""
1566         query = query.decode(preferredencoding())
1567         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1568
1569     def _real_extract(self, query):
1570         mobj = re.match(self._VALID_URL, query)
1571         if mobj is None:
1572             self._downloader.report_error(u'invalid search query "%s"' % query)
1573             return
1574
1575         prefix, query = query.split(':')
1576         prefix = prefix[8:]
1577         query = query.encode('utf-8')
1578         if prefix == '':
1579             self._download_n_results(query, 1)
1580             return
1581         elif prefix == 'all':
1582             self._download_n_results(query, self._max_google_results)
1583             return
1584         else:
1585             try:
1586                 n = int(prefix)
1587                 if n <= 0:
1588                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1589                     return
1590                 elif n > self._max_google_results:
1591                     self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1592                     n = self._max_google_results
1593                 self._download_n_results(query, n)
1594                 return
1595             except ValueError: # parsing prefix as integer fails
1596                 self._download_n_results(query, 1)
1597                 return
1598
1599     def _download_n_results(self, query, n):
1600         """Downloads a specified number of results for a query"""
1601
1602         video_ids = []
1603         pagenum = 0
1604
1605         while True:
1606             self.report_download_page(query, pagenum)
1607             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1608             request = compat_urllib_request.Request(result_url)
1609             try:
1610                 page = compat_urllib_request.urlopen(request).read()
1611             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1612                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1613                 return
1614
1615             # Extract video identifiers
1616             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1617                 video_id = mobj.group(1)
1618                 if video_id not in video_ids:
1619                     video_ids.append(video_id)
1620                     if len(video_ids) == n:
1621                         # Specified n videos reached
1622                         for id in video_ids:
1623                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1624                         return
1625
1626             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1627                 for id in video_ids:
1628                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1629                 return
1630
1631             pagenum = pagenum + 1
1632
1633
1634 class YahooSearchIE(InfoExtractor):
1635     """Information Extractor for Yahoo! Video search queries."""
1636
1637     _WORKING = False
1638     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1639     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1640     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1641     _MORE_PAGES_INDICATOR = r'\s*Next'
1642     _max_yahoo_results = 1000
1643     IE_NAME = u'video.yahoo:search'
1644
1645     def __init__(self, downloader=None):
1646         InfoExtractor.__init__(self, downloader)
1647
1648     def report_download_page(self, query, pagenum):
1649         """Report attempt to download playlist page with given number."""
1650         query = query.decode(preferredencoding())
1651         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1652
1653     def _real_extract(self, query):
1654         mobj = re.match(self._VALID_URL, query)
1655         if mobj is None:
1656             self._downloader.report_error(u'invalid search query "%s"' % query)
1657             return
1658
1659         prefix, query = query.split(':')
1660         prefix = prefix[8:]
1661         query = query.encode('utf-8')
1662         if prefix == '':
1663             self._download_n_results(query, 1)
1664             return
1665         elif prefix == 'all':
1666             self._download_n_results(query, self._max_yahoo_results)
1667             return
1668         else:
1669             try:
1670                 n = int(prefix)
1671                 if n <= 0:
1672                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1673                     return
1674                 elif n > self._max_yahoo_results:
1675                     self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1676                     n = self._max_yahoo_results
1677                 self._download_n_results(query, n)
1678                 return
1679             except ValueError: # parsing prefix as integer fails
1680                 self._download_n_results(query, 1)
1681                 return
1682
1683     def _download_n_results(self, query, n):
1684         """Downloads a specified number of results for a query"""
1685
1686         video_ids = []
1687         already_seen = set()
1688         pagenum = 1
1689
1690         while True:
1691             self.report_download_page(query, pagenum)
1692             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1693             request = compat_urllib_request.Request(result_url)
1694             try:
1695                 page = compat_urllib_request.urlopen(request).read()
1696             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1697                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1698                 return
1699
1700             # Extract video identifiers
1701             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1702                 video_id = mobj.group(1)
1703                 if video_id not in already_seen:
1704                     video_ids.append(video_id)
1705                     already_seen.add(video_id)
1706                     if len(video_ids) == n:
1707                         # Specified n videos reached
1708                         for id in video_ids:
1709                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1710                         return
1711
1712             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1713                 for id in video_ids:
1714                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1715                 return
1716
1717             pagenum = pagenum + 1
1718
1719
1720 class YoutubePlaylistIE(InfoExtractor):
1721     """Information Extractor for YouTube playlists."""
1722
1723     _VALID_URL = r"""(?:
1724                         (?:https?://)?
1725                         (?:\w+\.)?
1726                         youtube\.com/
1727                         (?:
1728                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1729                            \? (?:.*?&)*? (?:p|a|list)=
1730                         |  p/
1731                         )
1732                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1733                         .*
1734                      |
1735                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1736                      )"""
1737     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1738     _MAX_RESULTS = 50
1739     IE_NAME = u'youtube:playlist'
1740
1741     def __init__(self, downloader=None):
1742         InfoExtractor.__init__(self, downloader)
1743
1744     @classmethod
1745     def suitable(cls, url):
1746         """Receives a URL and returns True if suitable for this IE."""
1747         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1748
1749     def report_download_page(self, playlist_id, pagenum):
1750         """Report attempt to download playlist page with given number."""
1751         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1752
1753     def _real_extract(self, url):
1754         # Extract playlist id
1755         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1756         if mobj is None:
1757             self._downloader.report_error(u'invalid url: %s' % url)
1758             return
1759
1760         # Download playlist videos from API
1761         playlist_id = mobj.group(1) or mobj.group(2)
1762         page_num = 1
1763         videos = []
1764
1765         while True:
1766             self.report_download_page(playlist_id, page_num)
1767
1768             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1769             try:
1770                 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1771             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1772                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1773                 return
1774
1775             try:
1776                 response = json.loads(page)
1777             except ValueError as err:
1778                 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1779                 return
1780
1781             if not 'feed' in response or not 'entry' in response['feed']:
1782                 self._downloader.report_error(u'Got a malformed response from YouTube API')
1783                 return
1784             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1785                         for entry in response['feed']['entry']
1786                         if 'content' in entry ]
1787
1788             if len(response['feed']['entry']) < self._MAX_RESULTS:
1789                 break
1790             page_num += 1
1791
1792         videos = [v[1] for v in sorted(videos)]
1793         total = len(videos)
1794
1795         playliststart = self._downloader.params.get('playliststart', 1) - 1
1796         playlistend = self._downloader.params.get('playlistend', -1)
1797         if playlistend == -1:
1798             videos = videos[playliststart:]
1799         else:
1800             videos = videos[playliststart:playlistend]
1801
1802         if len(videos) == total:
1803             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1804         else:
1805             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1806
1807         for video in videos:
1808             self._downloader.download([video])
1809         return
1810
1811
1812 class YoutubeChannelIE(InfoExtractor):
1813     """Information Extractor for YouTube channels."""
1814
1815     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1816     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1817     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1818     IE_NAME = u'youtube:channel'
1819
1820     def report_download_page(self, channel_id, pagenum):
1821         """Report attempt to download channel page with given number."""
1822         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1823
1824     def _real_extract(self, url):
1825         # Extract channel id
1826         mobj = re.match(self._VALID_URL, url)
1827         if mobj is None:
1828             self._downloader.report_error(u'invalid url: %s' % url)
1829             return
1830
1831         # Download channel pages
1832         channel_id = mobj.group(1)
1833         video_ids = []
1834         pagenum = 1
1835
1836         while True:
1837             self.report_download_page(channel_id, pagenum)
1838             url = self._TEMPLATE_URL % (channel_id, pagenum)
1839             request = compat_urllib_request.Request(url)
1840             try:
1841                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1842             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1843                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1844                 return
1845
1846             # Extract video identifiers
1847             ids_in_page = []
1848             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1849                 if mobj.group(1) not in ids_in_page:
1850                     ids_in_page.append(mobj.group(1))
1851             video_ids.extend(ids_in_page)
1852
1853             if self._MORE_PAGES_INDICATOR not in page:
1854                 break
1855             pagenum = pagenum + 1
1856
1857         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1858
1859         for id in video_ids:
1860             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1861         return
1862
1863
1864 class YoutubeUserIE(InfoExtractor):
1865     """Information Extractor for YouTube users."""
1866
1867     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1868     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1869     _GDATA_PAGE_SIZE = 50
1870     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1871     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1872     IE_NAME = u'youtube:user'
1873
1874     def __init__(self, downloader=None):
1875         InfoExtractor.__init__(self, downloader)
1876
1877     def report_download_page(self, username, start_index):
1878         """Report attempt to download user page."""
1879         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1880                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1881
1882     def _real_extract(self, url):
1883         # Extract username
1884         mobj = re.match(self._VALID_URL, url)
1885         if mobj is None:
1886             self._downloader.report_error(u'invalid url: %s' % url)
1887             return
1888
1889         username = mobj.group(1)
1890
1891         # Download video ids using YouTube Data API. Result size per
1892         # query is limited (currently to 50 videos) so we need to query
1893         # page by page until there are no video ids - it means we got
1894         # all of them.
1895
1896         video_ids = []
1897         pagenum = 0
1898
1899         while True:
1900             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1901             self.report_download_page(username, start_index)
1902
1903             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1904
1905             try:
1906                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1907             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1908                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1909                 return
1910
1911             # Extract video identifiers
1912             ids_in_page = []
1913
1914             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1915                 if mobj.group(1) not in ids_in_page:
1916                     ids_in_page.append(mobj.group(1))
1917
1918             video_ids.extend(ids_in_page)
1919
1920             # A little optimization - if current page is not
1921             # "full", ie. does not contain PAGE_SIZE video ids then
1922             # we can assume that this page is the last one - there
1923             # are no more ids on further pages - no need to query
1924             # again.
1925
1926             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1927                 break
1928
1929             pagenum += 1
1930
1931         all_ids_count = len(video_ids)
1932         playliststart = self._downloader.params.get('playliststart', 1) - 1
1933         playlistend = self._downloader.params.get('playlistend', -1)
1934
1935         if playlistend == -1:
1936             video_ids = video_ids[playliststart:]
1937         else:
1938             video_ids = video_ids[playliststart:playlistend]
1939
1940         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1941                 (username, all_ids_count, len(video_ids)))
1942
1943         for video_id in video_ids:
1944             self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1945
1946
1947 class BlipTVUserIE(InfoExtractor):
1948     """Information Extractor for blip.tv users."""
1949
1950     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1951     _PAGE_SIZE = 12
1952     IE_NAME = u'blip.tv:user'
1953
1954     def __init__(self, downloader=None):
1955         InfoExtractor.__init__(self, downloader)
1956
1957     def report_download_page(self, username, pagenum):
1958         """Report attempt to download user page."""
1959         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1960                 (self.IE_NAME, username, pagenum))
1961
1962     def _real_extract(self, url):
1963         # Extract username
1964         mobj = re.match(self._VALID_URL, url)
1965         if mobj is None:
1966             self._downloader.report_error(u'invalid url: %s' % url)
1967             return
1968
1969         username = mobj.group(1)
1970
1971         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1972
1973         request = compat_urllib_request.Request(url)
1974
1975         try:
1976             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1977             mobj = re.search(r'data-users-id="([^"]+)"', page)
1978             page_base = page_base % mobj.group(1)
1979         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1980             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1981             return
1982
1983
1984         # Download video ids using BlipTV Ajax calls. Result size per
1985         # query is limited (currently to 12 videos) so we need to query
1986         # page by page until there are no video ids - it means we got
1987         # all of them.
1988
1989         video_ids = []
1990         pagenum = 1
1991
1992         while True:
1993             self.report_download_page(username, pagenum)
1994             url = page_base + "&page=" + str(pagenum)
1995             request = compat_urllib_request.Request( url )
1996             try:
1997                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1998             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1999                 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2000                 return
2001
2002             # Extract video identifiers
2003             ids_in_page = []
2004
2005             for mobj in re.finditer(r'href="/([^"]+)"', page):
2006                 if mobj.group(1) not in ids_in_page:
2007                     ids_in_page.append(unescapeHTML(mobj.group(1)))
2008
2009             video_ids.extend(ids_in_page)
2010
2011             # A little optimization - if current page is not
2012             # "full", ie. does not contain PAGE_SIZE video ids then
2013             # we can assume that this page is the last one - there
2014             # are no more ids on further pages - no need to query
2015             # again.
2016
2017             if len(ids_in_page) < self._PAGE_SIZE:
2018                 break
2019
2020             pagenum += 1
2021
2022         all_ids_count = len(video_ids)
2023         playliststart = self._downloader.params.get('playliststart', 1) - 1
2024         playlistend = self._downloader.params.get('playlistend', -1)
2025
2026         if playlistend == -1:
2027             video_ids = video_ids[playliststart:]
2028         else:
2029             video_ids = video_ids[playliststart:playlistend]
2030
2031         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2032                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2033
2034         for video_id in video_ids:
2035             self._downloader.download([u'http://blip.tv/'+video_id])
2036
2037
2038 class DepositFilesIE(InfoExtractor):
2039     """Information extractor for depositfiles.com"""
2040
2041     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2042
2043     def report_download_webpage(self, file_id):
2044         """Report webpage download."""
2045         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2046
2047     def report_extraction(self, file_id):
2048         """Report information extraction."""
2049         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2050
2051     def _real_extract(self, url):
2052         file_id = url.split('/')[-1]
2053         # Rebuild url in english locale
2054         url = 'http://depositfiles.com/en/files/' + file_id
2055
2056         # Retrieve file webpage with 'Free download' button pressed
2057         free_download_indication = { 'gateway_result' : '1' }
2058         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2059         try:
2060             self.report_download_webpage(file_id)
2061             webpage = compat_urllib_request.urlopen(request).read()
2062         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2063             self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2064             return
2065
2066         # Search for the real file URL
2067         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2068         if (mobj is None) or (mobj.group(1) is None):
2069             # Try to figure out reason of the error.
2070             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2071             if (mobj is not None) and (mobj.group(1) is not None):
2072                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2073                 self._downloader.report_error(u'%s' % restriction_message)
2074             else:
2075                 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2076             return
2077
2078         file_url = mobj.group(1)
2079         file_extension = os.path.splitext(file_url)[1][1:]
2080
2081         # Search for file title
2082         mobj = re.search(r'<b title="(.*?)">', webpage)
2083         if mobj is None:
2084             self._downloader.report_error(u'unable to extract title')
2085             return
2086         file_title = mobj.group(1).decode('utf-8')
2087
2088         return [{
2089             'id':       file_id.decode('utf-8'),
2090             'url':      file_url.decode('utf-8'),
2091             'uploader': None,
2092             'upload_date':  None,
2093             'title':    file_title,
2094             'ext':      file_extension.decode('utf-8'),
2095         }]
2096
2097
2098 class FacebookIE(InfoExtractor):
2099     """Information Extractor for Facebook"""
2100
2101     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2102     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2103     _NETRC_MACHINE = 'facebook'
2104     IE_NAME = u'facebook'
2105
2106     def report_login(self):
2107         """Report attempt to log in."""
2108         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2109
2110     def _real_initialize(self):
2111         if self._downloader is None:
2112             return
2113
2114         useremail = None
2115         password = None
2116         downloader_params = self._downloader.params
2117
2118         # Attempt to use provided username and password or .netrc data
2119         if downloader_params.get('username', None) is not None:
2120             useremail = downloader_params['username']
2121             password = downloader_params['password']
2122         elif downloader_params.get('usenetrc', False):
2123             try:
2124                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2125                 if info is not None:
2126                     useremail = info[0]
2127                     password = info[2]
2128                 else:
2129                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2130             except (IOError, netrc.NetrcParseError) as err:
2131                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2132                 return
2133
2134         if useremail is None:
2135             return
2136
2137         # Log in
2138         login_form = {
2139             'email': useremail,
2140             'pass': password,
2141             'login': 'Log+In'
2142             }
2143         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2144         try:
2145             self.report_login()
2146             login_results = compat_urllib_request.urlopen(request).read()
2147             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2148                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2149                 return
2150         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2151             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2152             return
2153
2154     def _real_extract(self, url):
2155         mobj = re.match(self._VALID_URL, url)
2156         if mobj is None:
2157             self._downloader.report_error(u'invalid URL: %s' % url)
2158             return
2159         video_id = mobj.group('ID')
2160
2161         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2162         webpage = self._download_webpage(url, video_id)
2163
2164         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2165         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2166         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2167         if not m:
2168             raise ExtractorError(u'Cannot parse data')
2169         data = dict(json.loads(m.group(1)))
2170         params_raw = compat_urllib_parse.unquote(data['params'])
2171         params = json.loads(params_raw)
2172         video_data = params['video_data'][0]
2173         video_url = video_data.get('hd_src')
2174         if not video_url:
2175             video_url = video_data['sd_src']
2176         if not video_url:
2177             raise ExtractorError(u'Cannot find video URL')
2178         video_duration = int(video_data['video_duration'])
2179         thumbnail = video_data['thumbnail_src']
2180
2181         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2182         if not m:
2183             raise ExtractorError(u'Cannot find title in webpage')
2184         video_title = unescapeHTML(m.group(1))
2185
2186         info = {
2187             'id': video_id,
2188             'title': video_title,
2189             'url': video_url,
2190             'ext': 'mp4',
2191             'duration': video_duration,
2192             'thumbnail': thumbnail,
2193         }
2194         return [info]
2195
2196
2197 class BlipTVIE(InfoExtractor):
2198     """Information extractor for blip.tv"""
2199
2200     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2201     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2202     IE_NAME = u'blip.tv'
2203
2204     def report_extraction(self, file_id):
2205         """Report information extraction."""
2206         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2207
2208     def report_direct_download(self, title):
2209         """Report information extraction."""
2210         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2211
2212     def _real_extract(self, url):
2213         mobj = re.match(self._VALID_URL, url)
2214         if mobj is None:
2215             self._downloader.report_error(u'invalid URL: %s' % url)
2216             return
2217
2218         urlp = compat_urllib_parse_urlparse(url)
2219         if urlp.path.startswith('/play/'):
2220             request = compat_urllib_request.Request(url)
2221             response = compat_urllib_request.urlopen(request)
2222             redirecturl = response.geturl()
2223             rurlp = compat_urllib_parse_urlparse(redirecturl)
2224             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2225             url = 'http://blip.tv/a/a-' + file_id
2226             return self._real_extract(url)
2227
2228
2229         if '?' in url:
2230             cchar = '&'
2231         else:
2232             cchar = '?'
2233         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2234         request = compat_urllib_request.Request(json_url)
2235         request.add_header('User-Agent', 'iTunes/10.6.1')
2236         self.report_extraction(mobj.group(1))
2237         info = None
2238         try:
2239             urlh = compat_urllib_request.urlopen(request)
2240             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2241                 basename = url.split('/')[-1]
2242                 title,ext = os.path.splitext(basename)
2243                 title = title.decode('UTF-8')
2244                 ext = ext.replace('.', '')
2245                 self.report_direct_download(title)
2246                 info = {
2247                     'id': title,
2248                     'url': url,
2249                     'uploader': None,
2250                     'upload_date': None,
2251                     'title': title,
2252                     'ext': ext,
2253                     'urlhandle': urlh
2254                 }
2255         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2256             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2257         if info is None: # Regular URL
2258             try:
2259                 json_code_bytes = urlh.read()
2260                 json_code = json_code_bytes.decode('utf-8')
2261             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2262                 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2263                 return
2264
2265             try:
2266                 json_data = json.loads(json_code)
2267                 if 'Post' in json_data:
2268                     data = json_data['Post']
2269                 else:
2270                     data = json_data
2271
2272                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2273                 video_url = data['media']['url']
2274                 umobj = re.match(self._URL_EXT, video_url)
2275                 if umobj is None:
2276                     raise ValueError('Can not determine filename extension')
2277                 ext = umobj.group(1)
2278
2279                 info = {
2280                     'id': data['item_id'],
2281                     'url': video_url,
2282                     'uploader': data['display_name'],
2283                     'upload_date': upload_date,
2284                     'title': data['title'],
2285                     'ext': ext,
2286                     'format': data['media']['mimeType'],
2287                     'thumbnail': data['thumbnailUrl'],
2288                     'description': data['description'],
2289                     'player_url': data['embedUrl'],
2290                     'user_agent': 'iTunes/10.6.1',
2291                 }
2292             except (ValueError,KeyError) as err:
2293                 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2294                 return
2295
2296         return [info]
2297
2298
2299 class MyVideoIE(InfoExtractor):
2300     """Information Extractor for myvideo.de."""
2301
2302     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2303     IE_NAME = u'myvideo'
2304
2305     def __init__(self, downloader=None):
2306         InfoExtractor.__init__(self, downloader)
2307
2308     def report_extraction(self, video_id):
2309         """Report information extraction."""
2310         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2311
2312     def _real_extract(self,url):
2313         mobj = re.match(self._VALID_URL, url)
2314         if mobj is None:
2315             self._download.report_error(u'invalid URL: %s' % url)
2316             return
2317
2318         video_id = mobj.group(1)
2319
2320         # Get video webpage
2321         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2322         webpage = self._download_webpage(webpage_url, video_id)
2323
2324         self.report_extraction(video_id)
2325         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2326                  webpage)
2327         if mobj is None:
2328             self._downloader.report_error(u'unable to extract media URL')
2329             return
2330         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2331
2332         mobj = re.search('<title>([^<]+)</title>', webpage)
2333         if mobj is None:
2334             self._downloader.report_error(u'unable to extract title')
2335             return
2336
2337         video_title = mobj.group(1)
2338
2339         return [{
2340             'id':       video_id,
2341             'url':      video_url,
2342             'uploader': None,
2343             'upload_date':  None,
2344             'title':    video_title,
2345             'ext':      u'flv',
2346         }]
2347
2348 class ComedyCentralIE(InfoExtractor):
2349     """Information extractor for The Daily Show and Colbert Report """
2350
2351     # urls can be abbreviations like :thedailyshow or :colbert
2352     # urls for episodes like:
2353     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2354     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2355     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2356     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2357                       |(https?://)?(www\.)?
2358                           (?P<showname>thedailyshow|colbertnation)\.com/
2359                          (full-episodes/(?P<episode>.*)|
2360                           (?P<clip>
2361                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2362                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2363                      $"""
2364
2365     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2366
2367     _video_extensions = {
2368         '3500': 'mp4',
2369         '2200': 'mp4',
2370         '1700': 'mp4',
2371         '1200': 'mp4',
2372         '750': 'mp4',
2373         '400': 'mp4',
2374     }
2375     _video_dimensions = {
2376         '3500': '1280x720',
2377         '2200': '960x540',
2378         '1700': '768x432',
2379         '1200': '640x360',
2380         '750': '512x288',
2381         '400': '384x216',
2382     }
2383
2384     @classmethod
2385     def suitable(cls, url):
2386         """Receives a URL and returns True if suitable for this IE."""
2387         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2388
2389     def report_extraction(self, episode_id):
2390         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2391
2392     def report_config_download(self, episode_id, media_id):
2393         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2394
2395     def report_index_download(self, episode_id):
2396         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2397
2398     def _print_formats(self, formats):
2399         print('Available formats:')
2400         for x in formats:
2401             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2402
2403
2404     def _real_extract(self, url):
2405         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2406         if mobj is None:
2407             self._downloader.report_error(u'invalid URL: %s' % url)
2408             return
2409
2410         if mobj.group('shortname'):
2411             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2412                 url = u'http://www.thedailyshow.com/full-episodes/'
2413             else:
2414                 url = u'http://www.colbertnation.com/full-episodes/'
2415             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2416             assert mobj is not None
2417
2418         if mobj.group('clip'):
2419             if mobj.group('showname') == 'thedailyshow':
2420                 epTitle = mobj.group('tdstitle')
2421             else:
2422                 epTitle = mobj.group('cntitle')
2423             dlNewest = False
2424         else:
2425             dlNewest = not mobj.group('episode')
2426             if dlNewest:
2427                 epTitle = mobj.group('showname')
2428             else:
2429                 epTitle = mobj.group('episode')
2430
2431         req = compat_urllib_request.Request(url)
2432         self.report_extraction(epTitle)
2433         try:
2434             htmlHandle = compat_urllib_request.urlopen(req)
2435             html = htmlHandle.read()
2436             webpage = html.decode('utf-8')
2437         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2438             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2439             return
2440         if dlNewest:
2441             url = htmlHandle.geturl()
2442             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2443             if mobj is None:
2444                 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2445                 return
2446             if mobj.group('episode') == '':
2447                 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2448                 return
2449             epTitle = mobj.group('episode')
2450
2451         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2452
2453         if len(mMovieParams) == 0:
2454             # The Colbert Report embeds the information in a without
2455             # a URL prefix; so extract the alternate reference
2456             # and then add the URL prefix manually.
2457
2458             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2459             if len(altMovieParams) == 0:
2460                 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2461                 return
2462             else:
2463                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2464
2465         uri = mMovieParams[0][1]
2466         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2467         self.report_index_download(epTitle)
2468         try:
2469             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2470         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2471             self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2472             return
2473
2474         results = []
2475
2476         idoc = xml.etree.ElementTree.fromstring(indexXml)
2477         itemEls = idoc.findall('.//item')
2478         for partNum,itemEl in enumerate(itemEls):
2479             mediaId = itemEl.findall('./guid')[0].text
2480             shortMediaId = mediaId.split(':')[-1]
2481             showId = mediaId.split(':')[-2].replace('.com', '')
2482             officialTitle = itemEl.findall('./title')[0].text
2483             officialDate = itemEl.findall('./pubDate')[0].text
2484
2485             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2486                         compat_urllib_parse.urlencode({'uri': mediaId}))
2487             configReq = compat_urllib_request.Request(configUrl)
2488             self.report_config_download(epTitle, shortMediaId)
2489             try:
2490                 configXml = compat_urllib_request.urlopen(configReq).read()
2491             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2493                 return
2494
2495             cdoc = xml.etree.ElementTree.fromstring(configXml)
2496             turls = []
2497             for rendition in cdoc.findall('.//rendition'):
2498                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2499                 turls.append(finfo)
2500
2501             if len(turls) == 0:
2502                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2503                 continue
2504
2505             if self._downloader.params.get('listformats', None):
2506                 self._print_formats([i[0] for i in turls])
2507                 return
2508
2509             # For now, just pick the highest bitrate
2510             format,rtmp_video_url = turls[-1]
2511
2512             # Get the format arg from the arg stream
2513             req_format = self._downloader.params.get('format', None)
2514
2515             # Select format if we can find one
2516             for f,v in turls:
2517                 if f == req_format:
2518                     format, rtmp_video_url = f, v
2519                     break
2520
2521             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2522             if not m:
2523                 raise ExtractorError(u'Cannot transform RTMP url')
2524             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2525             video_url = base + m.group('finalid')
2526
2527             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2528             info = {
2529                 'id': shortMediaId,
2530                 'url': video_url,
2531                 'uploader': showId,
2532                 'upload_date': officialDate,
2533                 'title': effTitle,
2534                 'ext': 'mp4',
2535                 'format': format,
2536                 'thumbnail': None,
2537                 'description': officialTitle,
2538             }
2539             results.append(info)
2540
2541         return results
2542
2543
2544 class EscapistIE(InfoExtractor):
2545     """Information extractor for The Escapist """
2546
2547     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2548     IE_NAME = u'escapist'
2549
2550     def report_extraction(self, showName):
2551         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2552
2553     def report_config_download(self, showName):
2554         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2555
2556     def _real_extract(self, url):
2557         mobj = re.match(self._VALID_URL, url)
2558         if mobj is None:
2559             self._downloader.report_error(u'invalid URL: %s' % url)
2560             return
2561         showName = mobj.group('showname')
2562         videoId = mobj.group('episode')
2563
2564         self.report_extraction(showName)
2565         try:
2566             webPage = compat_urllib_request.urlopen(url)
2567             webPageBytes = webPage.read()
2568             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2569             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2570         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2571             self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
2572             return
2573
2574         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2575         description = unescapeHTML(descMatch.group(1))
2576         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2577         imgUrl = unescapeHTML(imgMatch.group(1))
2578         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2579         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2580         configUrlMatch = re.search('config=(.*)$', playerUrl)
2581         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2582
2583         self.report_config_download(showName)
2584         try:
2585             configJSON = compat_urllib_request.urlopen(configUrl)
2586             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2587             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2588         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2589             self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2590             return
2591
2592         # Technically, it's JavaScript, not JSON
2593         configJSON = configJSON.replace("'", '"')
2594
2595         try:
2596             config = json.loads(configJSON)
2597         except (ValueError,) as err:
2598             self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2599             return
2600
2601         playlist = config['playlist']
2602         videoUrl = playlist[1]['url']
2603
2604         info = {
2605             'id': videoId,
2606             'url': videoUrl,
2607             'uploader': showName,
2608             'upload_date': None,
2609             'title': showName,
2610             'ext': 'mp4',
2611             'thumbnail': imgUrl,
2612             'description': description,
2613             'player_url': playerUrl,
2614         }
2615
2616         return [info]
2617
2618 class CollegeHumorIE(InfoExtractor):
2619     """Information extractor for collegehumor.com"""
2620
2621     _WORKING = False
2622     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2623     IE_NAME = u'collegehumor'
2624
2625     def report_manifest(self, video_id):
2626         """Report information extraction."""
2627         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2628
2629     def report_extraction(self, video_id):
2630         """Report information extraction."""
2631         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2632
2633     def _real_extract(self, url):
2634         mobj = re.match(self._VALID_URL, url)
2635         if mobj is None:
2636             self._downloader.report_error(u'invalid URL: %s' % url)
2637             return
2638         video_id = mobj.group('videoid')
2639
2640         info = {
2641             'id': video_id,
2642             'uploader': None,
2643             'upload_date': None,
2644         }
2645
2646         self.report_extraction(video_id)
2647         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2648         try:
2649             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2650         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2651             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2652             return
2653
2654         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2655         try:
2656             videoNode = mdoc.findall('./video')[0]
2657             info['description'] = videoNode.findall('./description')[0].text
2658             info['title'] = videoNode.findall('./caption')[0].text
2659             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2660             manifest_url = videoNode.findall('./file')[0].text
2661         except IndexError:
2662             self._downloader.report_error(u'Invalid metadata XML file')
2663             return
2664
2665         manifest_url += '?hdcore=2.10.3'
2666         self.report_manifest(video_id)
2667         try:
2668             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2669         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2670             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2671             return
2672
2673         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2674         try:
2675             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2676             node_id = media_node.attrib['url']
2677             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2678         except IndexError as err:
2679             self._downloader.report_error(u'Invalid manifest file')
2680             return
2681
2682         url_pr = compat_urllib_parse_urlparse(manifest_url)
2683         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2684
2685         info['url'] = url
2686         info['ext'] = 'f4f'
2687         return [info]
2688
2689
2690 class XVideosIE(InfoExtractor):
2691     """Information extractor for xvideos.com"""
2692
2693     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2694     IE_NAME = u'xvideos'
2695
2696     def report_extraction(self, video_id):
2697         """Report information extraction."""
2698         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2699
2700     def _real_extract(self, url):
2701         mobj = re.match(self._VALID_URL, url)
2702         if mobj is None:
2703             self._downloader.report_error(u'invalid URL: %s' % url)
2704             return
2705         video_id = mobj.group(1)
2706
2707         webpage = self._download_webpage(url, video_id)
2708
2709         self.report_extraction(video_id)
2710
2711
2712         # Extract video URL
2713         mobj = re.search(r'flv_url=(.+?)&', webpage)
2714         if mobj is None:
2715             self._downloader.report_error(u'unable to extract video url')
2716             return
2717         video_url = compat_urllib_parse.unquote(mobj.group(1))
2718
2719
2720         # Extract title
2721         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2722         if mobj is None:
2723             self._downloader.report_error(u'unable to extract video title')
2724             return
2725         video_title = mobj.group(1)
2726
2727
2728         # Extract video thumbnail
2729         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2730         if mobj is None:
2731             self._downloader.report_error(u'unable to extract video thumbnail')
2732             return
2733         video_thumbnail = mobj.group(0)
2734
2735         info = {
2736             'id': video_id,
2737             'url': video_url,
2738             'uploader': None,
2739             'upload_date': None,
2740             'title': video_title,
2741             'ext': 'flv',
2742             'thumbnail': video_thumbnail,
2743             'description': None,
2744         }
2745
2746         return [info]
2747
2748
2749 class SoundcloudIE(InfoExtractor):
2750     """Information extractor for soundcloud.com
2751        To access the media, the uid of the song and a stream token
2752        must be extracted from the page source and the script must make
2753        a request to media.soundcloud.com/crossdomain.xml. Then
2754        the media can be grabbed by requesting from an url composed
2755        of the stream token and uid
2756      """
2757
2758     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2759     IE_NAME = u'soundcloud'
2760
2761     def __init__(self, downloader=None):
2762         InfoExtractor.__init__(self, downloader)
2763
2764     def report_resolve(self, video_id):
2765         """Report information extraction."""
2766         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2767
2768     def report_extraction(self, video_id):
2769         """Report information extraction."""
2770         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2771
2772     def _real_extract(self, url):
2773         mobj = re.match(self._VALID_URL, url)
2774         if mobj is None:
2775             self._downloader.report_error(u'invalid URL: %s' % url)
2776             return
2777
2778         # extract uploader (which is in the url)
2779         uploader = mobj.group(1)
2780         # extract simple title (uploader + slug of song title)
2781         slug_title =  mobj.group(2)
2782         simple_title = uploader + u'-' + slug_title
2783
2784         self.report_resolve('%s/%s' % (uploader, slug_title))
2785
2786         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2787         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2788         request = compat_urllib_request.Request(resolv_url)
2789         try:
2790             info_json_bytes = compat_urllib_request.urlopen(request).read()
2791             info_json = info_json_bytes.decode('utf-8')
2792         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2793             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2794             return
2795
2796         info = json.loads(info_json)
2797         video_id = info['id']
2798         self.report_extraction('%s/%s' % (uploader, slug_title))
2799
2800         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2801         request = compat_urllib_request.Request(streams_url)
2802         try:
2803             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2804             stream_json = stream_json_bytes.decode('utf-8')
2805         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2806             self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2807             return
2808
2809         streams = json.loads(stream_json)
2810         mediaURL = streams['http_mp3_128_url']
2811
2812         return [{
2813             'id':       info['id'],
2814             'url':      mediaURL,
2815             'uploader': info['user']['username'],
2816             'upload_date':  info['created_at'],
2817             'title':    info['title'],
2818             'ext':      u'mp3',
2819             'description': info['description'],
2820         }]
2821
2822 class SoundcloudSetIE(InfoExtractor):
2823     """Information extractor for soundcloud.com sets
2824        To access the media, the uid of the song and a stream token
2825        must be extracted from the page source and the script must make
2826        a request to media.soundcloud.com/crossdomain.xml. Then
2827        the media can be grabbed by requesting from an url composed
2828        of the stream token and uid
2829      """
2830
2831     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2832     IE_NAME = u'soundcloud'
2833
2834     def __init__(self, downloader=None):
2835         InfoExtractor.__init__(self, downloader)
2836
2837     def report_resolve(self, video_id):
2838         """Report information extraction."""
2839         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2840
2841     def report_extraction(self, video_id):
2842         """Report information extraction."""
2843         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2844
2845     def _real_extract(self, url):
2846         mobj = re.match(self._VALID_URL, url)
2847         if mobj is None:
2848             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2849             return
2850
2851         # extract uploader (which is in the url)
2852         uploader = mobj.group(1)
2853         # extract simple title (uploader + slug of song title)
2854         slug_title =  mobj.group(2)
2855         simple_title = uploader + u'-' + slug_title
2856
2857         self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2858
2859         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2860         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2861         request = compat_urllib_request.Request(resolv_url)
2862         try:
2863             info_json_bytes = compat_urllib_request.urlopen(request).read()
2864             info_json = info_json_bytes.decode('utf-8')
2865         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2866             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2867             return
2868
2869         videos = []
2870         info = json.loads(info_json)
2871         if 'errors' in info:
2872             for err in info['errors']:
2873                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2874             return
2875
2876         for track in info['tracks']:
2877             video_id = track['id']
2878             self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2879
2880             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2881             request = compat_urllib_request.Request(streams_url)
2882             try:
2883                 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2884                 stream_json = stream_json_bytes.decode('utf-8')
2885             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2886                 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2887                 return
2888
2889             streams = json.loads(stream_json)
2890             mediaURL = streams['http_mp3_128_url']
2891
2892             videos.append({
2893                 'id':       video_id,
2894                 'url':      mediaURL,
2895                 'uploader': track['user']['username'],
2896                 'upload_date':  track['created_at'],
2897                 'title':    track['title'],
2898                 'ext':      u'mp3',
2899                 'description': track['description'],
2900             })
2901         return videos
2902
2903
2904 class InfoQIE(InfoExtractor):
2905     """Information extractor for infoq.com"""
2906     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2907
2908     def report_extraction(self, video_id):
2909         """Report information extraction."""
2910         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2911
2912     def _real_extract(self, url):
2913         mobj = re.match(self._VALID_URL, url)
2914         if mobj is None:
2915             self._downloader.report_error(u'invalid URL: %s' % url)
2916             return
2917
2918         webpage = self._download_webpage(url, video_id=url)
2919         self.report_extraction(url)
2920
2921         # Extract video URL
2922         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2923         if mobj is None:
2924             self._downloader.report_error(u'unable to extract video url')
2925             return
2926         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2927         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2928
2929         # Extract title
2930         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2931         if mobj is None:
2932             self._downloader.report_error(u'unable to extract video title')
2933             return
2934         video_title = mobj.group(1)
2935
2936         # Extract description
2937         video_description = u'No description available.'
2938         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2939         if mobj is not None:
2940             video_description = mobj.group(1)
2941
2942         video_filename = video_url.split('/')[-1]
2943         video_id, extension = video_filename.split('.')
2944
2945         info = {
2946             'id': video_id,
2947             'url': video_url,
2948             'uploader': None,
2949             'upload_date': None,
2950             'title': video_title,
2951             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2952             'thumbnail': None,
2953             'description': video_description,
2954         }
2955
2956         return [info]
2957
2958 class MixcloudIE(InfoExtractor):
2959     """Information extractor for www.mixcloud.com"""
2960
2961     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2962     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2963     IE_NAME = u'mixcloud'
2964
2965     def __init__(self, downloader=None):
2966         InfoExtractor.__init__(self, downloader)
2967
2968     def report_download_json(self, file_id):
2969         """Report JSON download."""
2970         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2971
2972     def report_extraction(self, file_id):
2973         """Report information extraction."""
2974         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2975
2976     def get_urls(self, jsonData, fmt, bitrate='best'):
2977         """Get urls from 'audio_formats' section in json"""
2978         file_url = None
2979         try:
2980             bitrate_list = jsonData[fmt]
2981             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2982                 bitrate = max(bitrate_list) # select highest
2983
2984             url_list = jsonData[fmt][bitrate]
2985         except TypeError: # we have no bitrate info.
2986             url_list = jsonData[fmt]
2987         return url_list
2988
2989     def check_urls(self, url_list):
2990         """Returns 1st active url from list"""
2991         for url in url_list:
2992             try:
2993                 compat_urllib_request.urlopen(url)
2994                 return url
2995             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2996                 url = None
2997
2998         return None
2999
3000     def _print_formats(self, formats):
3001         print('Available formats:')
3002         for fmt in formats.keys():
3003             for b in formats[fmt]:
3004                 try:
3005                     ext = formats[fmt][b][0]
3006                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3007                 except TypeError: # we have no bitrate info
3008                     ext = formats[fmt][0]
3009                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3010                     break
3011
3012     def _real_extract(self, url):
3013         mobj = re.match(self._VALID_URL, url)
3014         if mobj is None:
3015             self._downloader.report_error(u'invalid URL: %s' % url)
3016             return
3017         # extract uploader & filename from url
3018         uploader = mobj.group(1).decode('utf-8')
3019         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3020
3021         # construct API request
3022         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3023         # retrieve .json file with links to files
3024         request = compat_urllib_request.Request(file_url)
3025         try:
3026             self.report_download_json(file_url)
3027             jsonData = compat_urllib_request.urlopen(request).read()
3028         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3029             self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3030             return
3031
3032         # parse JSON
3033         json_data = json.loads(jsonData)
3034         player_url = json_data['player_swf_url']
3035         formats = dict(json_data['audio_formats'])
3036
3037         req_format = self._downloader.params.get('format', None)
3038         bitrate = None
3039
3040         if self._downloader.params.get('listformats', None):
3041             self._print_formats(formats)
3042             return
3043
3044         if req_format is None or req_format == 'best':
3045             for format_param in formats.keys():
3046                 url_list = self.get_urls(formats, format_param)
3047                 # check urls
3048                 file_url = self.check_urls(url_list)
3049                 if file_url is not None:
3050                     break # got it!
3051         else:
3052             if req_format not in formats:
3053                 self._downloader.report_error(u'format is not available')
3054                 return
3055
3056             url_list = self.get_urls(formats, req_format)
3057             file_url = self.check_urls(url_list)
3058             format_param = req_format
3059
3060         return [{
3061             'id': file_id.decode('utf-8'),
3062             'url': file_url.decode('utf-8'),
3063             'uploader': uploader.decode('utf-8'),
3064             'upload_date': None,
3065             'title': json_data['name'],
3066             'ext': file_url.split('.')[-1].decode('utf-8'),
3067             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3068             'thumbnail': json_data['thumbnail_url'],
3069             'description': json_data['description'],
3070             'player_url': player_url.decode('utf-8'),
3071         }]
3072
3073 class StanfordOpenClassroomIE(InfoExtractor):
3074     """Information extractor for Stanford's Open ClassRoom"""
3075
3076     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3077     IE_NAME = u'stanfordoc'
3078
3079     def report_download_webpage(self, objid):
3080         """Report information extraction."""
3081         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3082
3083     def report_extraction(self, video_id):
3084         """Report information extraction."""
3085         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3086
3087     def _real_extract(self, url):
3088         mobj = re.match(self._VALID_URL, url)
3089         if mobj is None:
3090             raise ExtractorError(u'Invalid URL: %s' % url)
3091
3092         if mobj.group('course') and mobj.group('video'): # A specific video
3093             course = mobj.group('course')
3094             video = mobj.group('video')
3095             info = {
3096                 'id': course + '_' + video,
3097                 'uploader': None,
3098                 'upload_date': None,
3099             }
3100
3101             self.report_extraction(info['id'])
3102             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3103             xmlUrl = baseUrl + video + '.xml'
3104             try:
3105                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3106             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3107                 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3108                 return
3109             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3110             try:
3111                 info['title'] = mdoc.findall('./title')[0].text
3112                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3113             except IndexError:
3114                 self._downloader.report_error(u'Invalid metadata XML file')
3115                 return
3116             info['ext'] = info['url'].rpartition('.')[2]
3117             return [info]
3118         elif mobj.group('course'): # A course page
3119             course = mobj.group('course')
3120             info = {
3121                 'id': course,
3122                 'type': 'playlist',
3123                 'uploader': None,
3124                 'upload_date': None,
3125             }
3126
3127             coursepage = self._download_webpage(url, info['id'],
3128                                         note='Downloading course info page',
3129                                         errnote='Unable to download course info page')
3130
3131             m = re.search('<h1>([^<]+)</h1>', coursepage)
3132             if m:
3133                 info['title'] = unescapeHTML(m.group(1))
3134             else:
3135                 info['title'] = info['id']
3136
3137             m = re.search('<description>([^<]+)</description>', coursepage)
3138             if m:
3139                 info['description'] = unescapeHTML(m.group(1))
3140
3141             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3142             info['list'] = [
3143                 {
3144                     'type': 'reference',
3145                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3146                 }
3147                     for vpage in links]
3148             results = []
3149             for entry in info['list']:
3150                 assert entry['type'] == 'reference'
3151                 results += self.extract(entry['url'])
3152             return results
3153         else: # Root page
3154             info = {
3155                 'id': 'Stanford OpenClassroom',
3156                 'type': 'playlist',
3157                 'uploader': None,
3158                 'upload_date': None,
3159             }
3160
3161             self.report_download_webpage(info['id'])
3162             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3163             try:
3164                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3165             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3166                 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3167                 return
3168
3169             info['title'] = info['id']
3170
3171             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3172             info['list'] = [
3173                 {
3174                     'type': 'reference',
3175                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3176                 }
3177                     for cpage in links]
3178
3179             results = []
3180             for entry in info['list']:
3181                 assert entry['type'] == 'reference'
3182                 results += self.extract(entry['url'])
3183             return results
3184
3185 class MTVIE(InfoExtractor):
3186     """Information extractor for MTV.com"""
3187
3188     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3189     IE_NAME = u'mtv'
3190
3191     def report_extraction(self, video_id):
3192         """Report information extraction."""
3193         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3194
3195     def _real_extract(self, url):
3196         mobj = re.match(self._VALID_URL, url)
3197         if mobj is None:
3198             self._downloader.report_error(u'invalid URL: %s' % url)
3199             return
3200         if not mobj.group('proto'):
3201             url = 'http://' + url
3202         video_id = mobj.group('videoid')
3203
3204         webpage = self._download_webpage(url, video_id)
3205
3206         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3207         if mobj is None:
3208             self._downloader.report_error(u'unable to extract song name')
3209             return
3210         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3211         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3212         if mobj is None:
3213             self._downloader.report_error(u'unable to extract performer')
3214             return
3215         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3216         video_title = performer + ' - ' + song_name
3217
3218         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3219         if mobj is None:
3220             self._downloader.report_error(u'unable to mtvn_uri')
3221             return
3222         mtvn_uri = mobj.group(1)
3223
3224         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3225         if mobj is None:
3226             self._downloader.report_error(u'unable to extract content id')
3227             return
3228         content_id = mobj.group(1)
3229
3230         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3231         self.report_extraction(video_id)
3232         request = compat_urllib_request.Request(videogen_url)
3233         try:
3234             metadataXml = compat_urllib_request.urlopen(request).read()
3235         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3236             self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3237             return
3238
3239         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3240         renditions = mdoc.findall('.//rendition')
3241
3242         # For now, always pick the highest quality.
3243         rendition = renditions[-1]
3244
3245         try:
3246             _,_,ext = rendition.attrib['type'].partition('/')
3247             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3248             video_url = rendition.find('./src').text
3249         except KeyError:
3250             self._downloader.trouble('Invalid rendition field.')
3251             return
3252
3253         info = {
3254             'id': video_id,
3255             'url': video_url,
3256             'uploader': performer,
3257             'upload_date': None,
3258             'title': video_title,
3259             'ext': ext,
3260             'format': format,
3261         }
3262
3263         return [info]
3264
3265
3266 class YoukuIE(InfoExtractor):
3267     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3268
3269     def report_download_webpage(self, file_id):
3270         """Report webpage download."""
3271         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3272
3273     def report_extraction(self, file_id):
3274         """Report information extraction."""
3275         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3276
3277     def _gen_sid(self):
3278         nowTime = int(time.time() * 1000)
3279         random1 = random.randint(1000,1998)
3280         random2 = random.randint(1000,9999)
3281
3282         return "%d%d%d" %(nowTime,random1,random2)
3283
3284     def _get_file_ID_mix_string(self, seed):
3285         mixed = []
3286         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3287         seed = float(seed)
3288         for i in range(len(source)):
3289             seed  =  (seed * 211 + 30031 ) % 65536
3290             index  =  math.floor(seed / 65536 * len(source) )
3291             mixed.append(source[int(index)])
3292             source.remove(source[int(index)])
3293         #return ''.join(mixed)
3294         return mixed
3295
3296     def _get_file_id(self, fileId, seed):
3297         mixed = self._get_file_ID_mix_string(seed)
3298         ids = fileId.split('*')
3299         realId = []
3300         for ch in ids:
3301             if ch:
3302                 realId.append(mixed[int(ch)])
3303         return ''.join(realId)
3304
3305     def _real_extract(self, url):
3306         mobj = re.match(self._VALID_URL, url)
3307         if mobj is None:
3308             self._downloader.report_error(u'invalid URL: %s' % url)
3309             return
3310         video_id = mobj.group('ID')
3311
3312         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3313
3314         request = compat_urllib_request.Request(info_url, None, std_headers)
3315         try:
3316             self.report_download_webpage(video_id)
3317             jsondata = compat_urllib_request.urlopen(request).read()
3318         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3319             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3320             return
3321
3322         self.report_extraction(video_id)
3323         try:
3324             jsonstr = jsondata.decode('utf-8')
3325             config = json.loads(jsonstr)
3326
3327             video_title =  config['data'][0]['title']
3328             seed = config['data'][0]['seed']
3329
3330             format = self._downloader.params.get('format', None)
3331             supported_format = list(config['data'][0]['streamfileids'].keys())
3332
3333             if format is None or format == 'best':
3334                 if 'hd2' in supported_format:
3335                     format = 'hd2'
3336                 else:
3337                     format = 'flv'
3338                 ext = u'flv'
3339             elif format == 'worst':
3340                 format = 'mp4'
3341                 ext = u'mp4'
3342             else:
3343                 format = 'flv'
3344                 ext = u'flv'
3345
3346
3347             fileid = config['data'][0]['streamfileids'][format]
3348             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3349         except (UnicodeDecodeError, ValueError, KeyError):
3350             self._downloader.report_error(u'unable to extract info section')
3351             return
3352
3353         files_info=[]
3354         sid = self._gen_sid()
3355         fileid = self._get_file_id(fileid, seed)
3356
3357         #column 8,9 of fileid represent the segment number
3358         #fileid[7:9] should be changed
3359         for index, key in enumerate(keys):
3360
3361             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3362             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3363
3364             info = {
3365                 'id': '%s_part%02d' % (video_id, index),
3366                 'url': download_url,
3367                 'uploader': None,
3368                 'upload_date': None,
3369                 'title': video_title,
3370                 'ext': ext,
3371             }
3372             files_info.append(info)
3373
3374         return files_info
3375
3376
3377 class XNXXIE(InfoExtractor):
3378     """Information extractor for xnxx.com"""
3379
3380     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3381     IE_NAME = u'xnxx'
3382     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3383     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3384     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3385
3386     def report_webpage(self, video_id):
3387         """Report information extraction"""
3388         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3389
3390     def report_extraction(self, video_id):
3391         """Report information extraction"""
3392         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3393
3394     def _real_extract(self, url):
3395         mobj = re.match(self._VALID_URL, url)
3396         if mobj is None:
3397             self._downloader.report_error(u'invalid URL: %s' % url)
3398             return
3399         video_id = mobj.group(1)
3400
3401         self.report_webpage(video_id)
3402
3403         # Get webpage content
3404         try:
3405             webpage_bytes = compat_urllib_request.urlopen(url).read()
3406             webpage = webpage_bytes.decode('utf-8')
3407         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3408             self._downloader.report_error(u'unable to download video webpage: %s' % err)
3409             return
3410
3411         result = re.search(self.VIDEO_URL_RE, webpage)
3412         if result is None:
3413             self._downloader.report_error(u'unable to extract video url')
3414             return
3415         video_url = compat_urllib_parse.unquote(result.group(1))
3416
3417         result = re.search(self.VIDEO_TITLE_RE, webpage)
3418         if result is None:
3419             self._downloader.report_error(u'unable to extract video title')
3420             return
3421         video_title = result.group(1)
3422
3423         result = re.search(self.VIDEO_THUMB_RE, webpage)
3424         if result is None:
3425             self._downloader.report_error(u'unable to extract video thumbnail')
3426             return
3427         video_thumbnail = result.group(1)
3428
3429         return [{
3430             'id': video_id,
3431             'url': video_url,
3432             'uploader': None,
3433             'upload_date': None,
3434             'title': video_title,
3435             'ext': 'flv',
3436             'thumbnail': video_thumbnail,
3437             'description': None,
3438         }]
3439
3440
3441 class GooglePlusIE(InfoExtractor):
3442     """Information extractor for plus.google.com."""
3443
3444     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3445     IE_NAME = u'plus.google'
3446
3447     def __init__(self, downloader=None):
3448         InfoExtractor.__init__(self, downloader)
3449
3450     def report_extract_entry(self, url):
3451         """Report downloading extry"""
3452         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3453
3454     def report_date(self, upload_date):
3455         """Report downloading extry"""
3456         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3457
3458     def report_uploader(self, uploader):
3459         """Report downloading extry"""
3460         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3461
3462     def report_title(self, video_title):
3463         """Report downloading extry"""
3464         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3465
3466     def report_extract_vid_page(self, video_page):
3467         """Report information extraction."""
3468         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3469
3470     def _real_extract(self, url):
3471         # Extract id from URL
3472         mobj = re.match(self._VALID_URL, url)
3473         if mobj is None:
3474             self._downloader.report_error(u'Invalid URL: %s' % url)
3475             return
3476
3477         post_url = mobj.group(0)
3478         video_id = mobj.group(1)
3479
3480         video_extension = 'flv'
3481
3482         # Step 1, Retrieve post webpage to extract further information
3483         self.report_extract_entry(post_url)
3484         request = compat_urllib_request.Request(post_url)
3485         try:
3486             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3487         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3488             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3489             return
3490
3491         # Extract update date
3492         upload_date = None
3493         pattern = 'title="Timestamp">(.*?)</a>'
3494         mobj = re.search(pattern, webpage)
3495         if mobj:
3496             upload_date = mobj.group(1)
3497             # Convert timestring to a format suitable for filename
3498             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3499             upload_date = upload_date.strftime('%Y%m%d')
3500         self.report_date(upload_date)
3501
3502         # Extract uploader
3503         uploader = None
3504         pattern = r'rel\="author".*?>(.*?)</a>'
3505         mobj = re.search(pattern, webpage)
3506         if mobj:
3507             uploader = mobj.group(1)
3508         self.report_uploader(uploader)
3509
3510         # Extract title
3511         # Get the first line for title
3512         video_title = u'NA'
3513         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3514         mobj = re.search(pattern, webpage)
3515         if mobj:
3516             video_title = mobj.group(1)
3517         self.report_title(video_title)
3518
3519         # Step 2, Stimulate clicking the image box to launch video
3520         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3521         mobj = re.search(pattern, webpage)
3522         if mobj is None:
3523             self._downloader.report_error(u'unable to extract video page URL')
3524
3525         video_page = mobj.group(1)
3526         request = compat_urllib_request.Request(video_page)
3527         try:
3528             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3529         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3530             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3531             return
3532         self.report_extract_vid_page(video_page)
3533
3534
3535         # Extract video links on video page
3536         """Extract video links of all sizes"""
3537         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3538         mobj = re.findall(pattern, webpage)
3539         if len(mobj) == 0:
3540             self._downloader.report_error(u'unable to extract video links')
3541
3542         # Sort in resolution
3543         links = sorted(mobj)
3544
3545         # Choose the lowest of the sort, i.e. highest resolution
3546         video_url = links[-1]
3547         # Only get the url. The resolution part in the tuple has no use anymore
3548         video_url = video_url[-1]
3549         # Treat escaped \u0026 style hex
3550         try:
3551             video_url = video_url.decode("unicode_escape")
3552         except AttributeError: # Python 3
3553             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3554
3555
3556         return [{
3557             'id':       video_id,
3558             'url':      video_url,
3559             'uploader': uploader,
3560             'upload_date':  upload_date,
3561             'title':    video_title,
3562             'ext':      video_extension,
3563         }]
3564
3565 class NBAIE(InfoExtractor):
3566     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3567     IE_NAME = u'nba'
3568
3569     def _real_extract(self, url):
3570         mobj = re.match(self._VALID_URL, url)
3571         if mobj is None:
3572             self._downloader.report_error(u'invalid URL: %s' % url)
3573             return
3574
3575         video_id = mobj.group(1)
3576         if video_id.endswith('/index.html'):
3577             video_id = video_id[:-len('/index.html')]
3578
3579         webpage = self._download_webpage(url, video_id)
3580
3581         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3582         def _findProp(rexp, default=None):
3583             m = re.search(rexp, webpage)
3584             if m:
3585                 return unescapeHTML(m.group(1))
3586             else:
3587                 return default
3588
3589         shortened_video_id = video_id.rpartition('/')[2]
3590         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3591         info = {
3592             'id': shortened_video_id,
3593             'url': video_url,
3594             'ext': 'mp4',
3595             'title': title,
3596             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3597             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3598         }
3599         return [info]
3600
3601 class JustinTVIE(InfoExtractor):
3602     """Information extractor for justin.tv and twitch.tv"""
3603     # TODO: One broadcast may be split into multiple videos. The key
3604     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3605     # starts at 1 and increases. Can we treat all parts as one video?
3606
3607     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3608         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3609     _JUSTIN_PAGE_LIMIT = 100
3610     IE_NAME = u'justin.tv'
3611
3612     def report_extraction(self, file_id):
3613         """Report information extraction."""
3614         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3615
3616     def report_download_page(self, channel, offset):
3617         """Report attempt to download a single page of videos."""
3618         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3619                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3620
3621     # Return count of items, list of *valid* items
3622     def _parse_page(self, url):
3623         try:
3624             urlh = compat_urllib_request.urlopen(url)
3625             webpage_bytes = urlh.read()
3626             webpage = webpage_bytes.decode('utf-8', 'ignore')
3627         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3628             self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3629             return
3630
3631         response = json.loads(webpage)
3632         if type(response) != list:
3633             error_text = response.get('error', 'unknown error')
3634             self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3635             return
3636         info = []
3637         for clip in response:
3638             video_url = clip['video_file_url']
3639             if video_url:
3640                 video_extension = os.path.splitext(video_url)[1][1:]
3641                 video_date = re.sub('-', '', clip['start_time'][:10])
3642                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3643                 video_id = clip['id']
3644                 video_title = clip.get('title', video_id)
3645                 info.append({
3646                     'id': video_id,
3647                     'url': video_url,
3648                     'title': video_title,
3649                     'uploader': clip.get('channel_name', video_uploader_id),
3650                     'uploader_id': video_uploader_id,
3651                     'upload_date': video_date,
3652                     'ext': video_extension,
3653                 })
3654         return (len(response), info)
3655
3656     def _real_extract(self, url):
3657         mobj = re.match(self._VALID_URL, url)
3658         if mobj is None:
3659             self._downloader.report_error(u'invalid URL: %s' % url)
3660             return
3661
3662         api = 'http://api.justin.tv'
3663         video_id = mobj.group(mobj.lastindex)
3664         paged = False
3665         if mobj.lastindex == 1:
3666             paged = True
3667             api += '/channel/archives/%s.json'
3668         else:
3669             api += '/broadcast/by_archive/%s.json'
3670         api = api % (video_id,)
3671
3672         self.report_extraction(video_id)
3673
3674         info = []
3675         offset = 0
3676         limit = self._JUSTIN_PAGE_LIMIT
3677         while True:
3678             if paged:
3679                 self.report_download_page(video_id, offset)
3680             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3681             page_count, page_info = self._parse_page(page_url)
3682             info.extend(page_info)
3683             if not paged or page_count != limit:
3684                 break
3685             offset += limit
3686         return info
3687
3688 class FunnyOrDieIE(InfoExtractor):
3689     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3690
3691     def _real_extract(self, url):
3692         mobj = re.match(self._VALID_URL, url)
3693         if mobj is None:
3694             self._downloader.report_error(u'invalid URL: %s' % url)
3695             return
3696
3697         video_id = mobj.group('id')
3698         webpage = self._download_webpage(url, video_id)
3699
3700         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3701         if not m:
3702             self._downloader.report_error(u'unable to find video information')
3703         video_url = unescapeHTML(m.group('url'))
3704
3705         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3706         if not m:
3707             self._downloader.trouble(u'Cannot find video title')
3708         title = clean_html(m.group('title'))
3709
3710         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3711         if m:
3712             desc = unescapeHTML(m.group('desc'))
3713         else:
3714             desc = None
3715
3716         info = {
3717             'id': video_id,
3718             'url': video_url,
3719             'ext': 'mp4',
3720             'title': title,
3721             'description': desc,
3722         }
3723         return [info]
3724
3725 class SteamIE(InfoExtractor):
3726     _VALID_URL = r"""http://store.steampowered.com/
3727                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3728                 (?P<gameID>\d+)/?
3729                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3730                 """
3731
3732     @classmethod
3733     def suitable(cls, url):
3734         """Receives a URL and returns True if suitable for this IE."""
3735         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3736
3737     def _real_extract(self, url):
3738         m = re.match(self._VALID_URL, url, re.VERBOSE)
3739         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3740         gameID = m.group('gameID')
3741         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3742         webpage = self._download_webpage(videourl, gameID)
3743         mweb = re.finditer(urlRE, webpage)
3744         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3745         titles = re.finditer(namesRE, webpage)
3746         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3747         thumbs = re.finditer(thumbsRE, webpage)
3748         videos = []
3749         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3750             video_id = vid.group('videoID')
3751             title = vtitle.group('videoName')
3752             video_url = vid.group('videoURL')
3753             video_thumb = thumb.group('thumbnail')
3754             if not video_url:
3755                 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3756             info = {
3757                 'id':video_id,
3758                 'url':video_url,
3759                 'ext': 'flv',
3760                 'title': unescapeHTML(title),
3761                 'thumbnail': video_thumb
3762                   }
3763             videos.append(info)
3764         return videos
3765
3766 class UstreamIE(InfoExtractor):
3767     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3768     IE_NAME = u'ustream'
3769
3770     def _real_extract(self, url):
3771         m = re.match(self._VALID_URL, url)
3772         video_id = m.group('videoID')
3773         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3774         webpage = self._download_webpage(url, video_id)
3775         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3776         title = m.group('title')
3777         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3778         uploader = m.group('uploader')
3779         info = {
3780                 'id':video_id,
3781                 'url':video_url,
3782                 'ext': 'flv',
3783                 'title': title,
3784                 'uploader': uploader
3785                   }
3786         return [info]
3787
3788 class WorldStarHipHopIE(InfoExtractor):
3789     _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3790     IE_NAME = u'WorldStarHipHop'
3791
3792     def _real_extract(self, url):
3793         _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3794
3795         webpage_src = compat_urllib_request.urlopen(url).read()
3796         webpage_src = webpage_src.decode('utf-8')
3797
3798         mobj = re.search(_src_url, webpage_src)
3799
3800         m = re.match(self._VALID_URL, url)
3801         video_id = m.group('id')
3802
3803         if mobj is not None:
3804             video_url = mobj.group()
3805             if 'mp4' in video_url:
3806                 ext = 'mp4'
3807             else:
3808                 ext = 'flv'
3809         else:
3810             self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3811             return
3812
3813         _title = r"""<title>(.*)</title>"""
3814
3815         mobj = re.search(_title, webpage_src)
3816
3817         if mobj is not None:
3818             title = mobj.group(1)
3819         else:
3820             title = 'World Start Hip Hop - %s' % time.ctime()
3821
3822         _thumbnail = r"""rel="image_src" href="(.*)" />"""
3823         mobj = re.search(_thumbnail, webpage_src)
3824
3825         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3826         if mobj is not None:
3827             thumbnail = mobj.group(1)
3828         else:
3829             _title = r"""candytitles.*>(.*)</span>"""
3830             mobj = re.search(_title, webpage_src)
3831             if mobj is not None:
3832                 title = mobj.group(1)
3833             thumbnail = None
3834
3835         results = [{
3836                     'id': video_id,
3837                     'url' : video_url,
3838                     'title' : title,
3839                     'thumbnail' : thumbnail,
3840                     'ext' : ext,
3841                     }]
3842         return results
3843
3844 class RBMARadioIE(InfoExtractor):
3845     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3846
3847     def _real_extract(self, url):
3848         m = re.match(self._VALID_URL, url)
3849         video_id = m.group('videoID')
3850
3851         webpage = self._download_webpage(url, video_id)
3852         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3853         if not m:
3854             raise ExtractorError(u'Cannot find metadata')
3855         json_data = m.group(1)
3856
3857         try:
3858             data = json.loads(json_data)
3859         except ValueError as e:
3860             raise ExtractorError(u'Invalid JSON: ' + str(e))
3861
3862         video_url = data['akamai_url'] + '&cbr=256'
3863         url_parts = compat_urllib_parse_urlparse(video_url)
3864         video_ext = url_parts.path.rpartition('.')[2]
3865         info = {
3866                 'id': video_id,
3867                 'url': video_url,
3868                 'ext': video_ext,
3869                 'title': data['title'],
3870                 'description': data.get('teaser_text'),
3871                 'location': data.get('country_of_origin'),
3872                 'uploader': data.get('host', {}).get('name'),
3873                 'uploader_id': data.get('host', {}).get('slug'),
3874                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3875                 'duration': data.get('duration'),
3876         }
3877         return [info]
3878
3879
3880 class YouPornIE(InfoExtractor):
3881     """Information extractor for youporn.com."""
3882     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3883
3884     def _print_formats(self, formats):
3885         """Print all available formats"""
3886         print(u'Available formats:')
3887         print(u'ext\t\tformat')
3888         print(u'---------------------------------')
3889         for format in formats:
3890             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3891
3892     def _specific(self, req_format, formats):
3893         for x in formats:
3894             if(x["format"]==req_format):
3895                 return x
3896         return None
3897
3898     def _real_extract(self, url):
3899         mobj = re.match(self._VALID_URL, url)
3900         if mobj is None:
3901             self._downloader.report_error(u'invalid URL: %s' % url)
3902             return
3903
3904         video_id = mobj.group('videoid')
3905
3906         req = compat_urllib_request.Request(url)
3907         req.add_header('Cookie', 'age_verified=1')
3908         webpage = self._download_webpage(req, video_id)
3909
3910         # Get the video title
3911         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3912         if result is None:
3913             raise ExtractorError(u'Unable to extract video title')
3914         video_title = result.group('title').strip()
3915
3916         # Get the video date
3917         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3918         if result is None:
3919             self._downloader.report_warning(u'unable to extract video date')
3920             upload_date = None
3921         else:
3922             upload_date = result.group('date').strip()
3923
3924         # Get the video uploader
3925         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3926         if result is None:
3927             self._downloader.report_warning(u'unable to extract uploader')
3928             video_uploader = None
3929         else:
3930             video_uploader = result.group('uploader').strip()
3931             video_uploader = clean_html( video_uploader )
3932
3933         # Get all of the formats available
3934         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3935         result = re.search(DOWNLOAD_LIST_RE, webpage)
3936         if result is None:
3937             raise ExtractorError(u'Unable to extract download list')
3938         download_list_html = result.group('download_list').strip()
3939
3940         # Get all of the links from the page
3941         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3942         links = re.findall(LINK_RE, download_list_html)
3943         if(len(links) == 0):
3944             raise ExtractorError(u'ERROR: no known formats available for video')
3945
3946         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3947
3948         formats = []
3949         for link in links:
3950
3951             # A link looks like this:
3952             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3953             # A path looks like this:
3954             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3955             video_url = unescapeHTML( link )
3956             path = compat_urllib_parse_urlparse( video_url ).path
3957             extension = os.path.splitext( path )[1][1:]
3958             format = path.split('/')[4].split('_')[:2]
3959             size = format[0]
3960             bitrate = format[1]
3961             format = "-".join( format )
3962             title = u'%s-%s-%s' % (video_title, size, bitrate)
3963
3964             formats.append({
3965                 'id': video_id,
3966                 'url': video_url,
3967                 'uploader': video_uploader,
3968                 'upload_date': upload_date,
3969                 'title': title,
3970                 'ext': extension,
3971                 'format': format,
3972                 'thumbnail': None,
3973                 'description': None,
3974                 'player_url': None
3975             })
3976
3977         if self._downloader.params.get('listformats', None):
3978             self._print_formats(formats)
3979             return
3980
3981         req_format = self._downloader.params.get('format', None)
3982         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3983
3984         if req_format is None or req_format == 'best':
3985             return [formats[0]]
3986         elif req_format == 'worst':
3987             return [formats[-1]]
3988         elif req_format in ('-1', 'all'):
3989             return formats
3990         else:
3991             format = self._specific( req_format, formats )
3992             if result is None:
3993                 self._downloader.report_error(u'requested format not available')
3994                 return
3995             return [format]
3996
3997
3998
3999 class PornotubeIE(InfoExtractor):
4000     """Information extractor for pornotube.com."""
4001     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
4002
4003     def _real_extract(self, url):
4004         mobj = re.match(self._VALID_URL, url)
4005         if mobj is None:
4006             self._downloader.report_error(u'invalid URL: %s' % url)
4007             return
4008
4009         video_id = mobj.group('videoid')
4010         video_title = mobj.group('title')
4011
4012         # Get webpage content
4013         webpage = self._download_webpage(url, video_id)
4014
4015         # Get the video URL
4016         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4017         result = re.search(VIDEO_URL_RE, webpage)
4018         if result is None:
4019             self._downloader.report_error(u'unable to extract video url')
4020             return
4021         video_url = compat_urllib_parse.unquote(result.group('url'))
4022
4023         #Get the uploaded date
4024         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4025         result = re.search(VIDEO_UPLOADED_RE, webpage)
4026         if result is None:
4027             self._downloader.report_error(u'unable to extract video title')
4028             return
4029         upload_date = result.group('date')
4030
4031         info = {'id': video_id,
4032                 'url': video_url,
4033                 'uploader': None,
4034                 'upload_date': upload_date,
4035                 'title': video_title,
4036                 'ext': 'flv',
4037                 'format': 'flv'}
4038
4039         return [info]
4040
4041 class YouJizzIE(InfoExtractor):
4042     """Information extractor for youjizz.com."""
4043     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4044
4045     def _real_extract(self, url):
4046         mobj = re.match(self._VALID_URL, url)
4047         if mobj is None:
4048             self._downloader.report_error(u'invalid URL: %s' % url)
4049             return
4050
4051         video_id = mobj.group('videoid')
4052
4053         # Get webpage content
4054         webpage = self._download_webpage(url, video_id)
4055
4056         # Get the video title
4057         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4058         if result is None:
4059             raise ExtractorError(u'ERROR: unable to extract video title')
4060         video_title = result.group('title').strip()
4061
4062         # Get the embed page
4063         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4064         if result is None:
4065             raise ExtractorError(u'ERROR: unable to extract embed page')
4066
4067         embed_page_url = result.group(0).strip()
4068         video_id = result.group('videoid')
4069
4070         webpage = self._download_webpage(embed_page_url, video_id)
4071
4072         # Get the video URL
4073         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4074         if result is None:
4075             raise ExtractorError(u'ERROR: unable to extract video url')
4076         video_url = result.group('source')
4077
4078         info = {'id': video_id,
4079                 'url': video_url,
4080                 'title': video_title,
4081                 'ext': 'flv',
4082                 'format': 'flv',
4083                 'player_url': embed_page_url}
4084
4085         return [info]
4086
4087 class EightTracksIE(InfoExtractor):
4088     IE_NAME = '8tracks'
4089     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4090
4091     def _real_extract(self, url):
4092         mobj = re.match(self._VALID_URL, url)
4093         if mobj is None:
4094             raise ExtractorError(u'Invalid URL: %s' % url)
4095         playlist_id = mobj.group('id')
4096
4097         webpage = self._download_webpage(url, playlist_id)
4098
4099         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4100         if not m:
4101             raise ExtractorError(u'Cannot find trax information')
4102         json_like = m.group(1)
4103         data = json.loads(json_like)
4104
4105         session = str(random.randint(0, 1000000000))
4106         mix_id = data['id']
4107         track_count = data['tracks_count']
4108         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4109         next_url = first_url
4110         res = []
4111         for i in itertools.count():
4112             api_json = self._download_webpage(next_url, playlist_id,
4113                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4114                 errnote=u'Failed to download song information')
4115             api_data = json.loads(api_json)
4116             track_data = api_data[u'set']['track']
4117             info = {
4118                 'id': track_data['id'],
4119                 'url': track_data['track_file_stream_url'],
4120                 'title': track_data['performer'] + u' - ' + track_data['name'],
4121                 'raw_title': track_data['name'],
4122                 'uploader_id': data['user']['login'],
4123                 'ext': 'm4a',
4124             }
4125             res.append(info)
4126             if api_data['set']['at_last_track']:
4127                 break
4128             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
4129         return res
4130
4131 class KeekIE(InfoExtractor):
4132     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4133     IE_NAME = u'keek'
4134
4135     def _real_extract(self, url):
4136         m = re.match(self._VALID_URL, url)
4137         video_id = m.group('videoID')
4138         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4139         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4140         webpage = self._download_webpage(url, video_id)
4141         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4142         title = unescapeHTML(m.group('title'))
4143         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4144         uploader = clean_html(m.group('uploader'))
4145         info = {
4146                 'id': video_id,
4147                 'url': video_url,
4148                 'ext': 'mp4',
4149                 'title': title,
4150                 'thumbnail': thumbnail,
4151                 'uploader': uploader
4152         }
4153         return [info]
4154
4155 class TEDIE(InfoExtractor):
4156     _VALID_URL=r'''http://www.ted.com/
4157                    (
4158                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4159                         |
4160                         ((?P<type_talk>talks)) # We have a simple talk
4161                    )
4162                    /(?P<name>\w+) # Here goes the name and then ".html"
4163                    '''
4164
4165     @classmethod
4166     def suitable(cls, url):
4167         """Receives a URL and returns True if suitable for this IE."""
4168         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4169
4170     def _real_extract(self, url):
4171         m=re.match(self._VALID_URL, url, re.VERBOSE)
4172         if m.group('type_talk'):
4173             return [self._talk_info(url)]
4174         else :
4175             playlist_id=m.group('playlist_id')
4176             name=m.group('name')
4177             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4178             return self._playlist_videos_info(url,name,playlist_id)
4179
4180     def _talk_video_link(self,mediaSlug):
4181         '''Returns the video link for that mediaSlug'''
4182         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4183
4184     def _playlist_videos_info(self,url,name,playlist_id=0):
4185         '''Returns the videos of the playlist'''
4186         video_RE=r'''
4187                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4188                      ([.\s]*?)data-playlist_item_id="(\d+)"
4189                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4190                      '''
4191         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4192         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4193         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4194         m_names=re.finditer(video_name_RE,webpage)
4195         info=[]
4196         for m_video, m_name in zip(m_videos,m_names):
4197             video_id=m_video.group('video_id')
4198             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4199             info.append(self._talk_info(talk_url,video_id))
4200         return info
4201
4202     def _talk_info(self, url, video_id=0):
4203         """Return the video for the talk in the url"""
4204         m=re.match(self._VALID_URL, url,re.VERBOSE)
4205         videoName=m.group('name')
4206         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4207         # If the url includes the language we get the title translated
4208         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4209         title=re.search(title_RE, webpage).group('title')
4210         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4211                         "id":(?P<videoID>[\d]+).*?
4212                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4213         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4214         thumb_match=re.search(thumb_RE,webpage)
4215         info_match=re.search(info_RE,webpage,re.VERBOSE)
4216         video_id=info_match.group('videoID')
4217         mediaSlug=info_match.group('mediaSlug')
4218         video_url=self._talk_video_link(mediaSlug)
4219         info = {
4220                 'id': video_id,
4221                 'url': video_url,
4222                 'ext': 'mp4',
4223                 'title': title,
4224                 'thumbnail': thumb_match.group('thumbnail')
4225                 }
4226         return info
4227
4228 class MySpassIE(InfoExtractor):
4229     _VALID_URL = r'http://www.myspass.de/.*'
4230
4231     def _real_extract(self, url):
4232         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4233
4234         # video id is the last path element of the URL
4235         # usually there is a trailing slash, so also try the second but last
4236         url_path = compat_urllib_parse_urlparse(url).path
4237         url_parent_path, video_id = os.path.split(url_path)
4238         if not video_id:
4239             _, video_id = os.path.split(url_parent_path)
4240
4241         # get metadata
4242         metadata_url = META_DATA_URL_TEMPLATE % video_id
4243         metadata_text = self._download_webpage(metadata_url, video_id)
4244         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4245
4246         # extract values from metadata
4247         url_flv_el = metadata.find('url_flv')
4248         if url_flv_el is None:
4249             self._downloader.report_error(u'unable to extract download url')
4250             return
4251         video_url = url_flv_el.text
4252         extension = os.path.splitext(video_url)[1][1:]
4253         title_el = metadata.find('title')
4254         if title_el is None:
4255             self._downloader.report_error(u'unable to extract title')
4256             return
4257         title = title_el.text
4258         format_id_el = metadata.find('format_id')
4259         if format_id_el is None:
4260             format = ext
4261         else:
4262             format = format_id_el.text
4263         description_el = metadata.find('description')
4264         if description_el is not None:
4265             description = description_el.text
4266         else:
4267             description = None
4268         imagePreview_el = metadata.find('imagePreview')
4269         if imagePreview_el is not None:
4270             thumbnail = imagePreview_el.text
4271         else:
4272             thumbnail = None
4273         info = {
4274             'id': video_id,
4275             'url': video_url,
4276             'title': title,
4277             'ext': extension,
4278             'format': format,
4279             'thumbnail': thumbnail,
4280             'description': description
4281         }
4282         return [info]
4283
4284 class SpiegelIE(InfoExtractor):
4285     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4286
4287     def _real_extract(self, url):
4288         m = re.match(self._VALID_URL, url)
4289         video_id = m.group('videoID')
4290
4291         webpage = self._download_webpage(url, video_id)
4292         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4293         if not m:
4294             raise ExtractorError(u'Cannot find title')
4295         video_title = unescapeHTML(m.group(1))
4296
4297         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4298         xml_code = self._download_webpage(xml_url, video_id,
4299                     note=u'Downloading XML', errnote=u'Failed to download XML')
4300
4301         idoc = xml.etree.ElementTree.fromstring(xml_code)
4302         last_type = idoc[-1]
4303         filename = last_type.findall('./filename')[0].text
4304         duration = float(last_type.findall('./duration')[0].text)
4305
4306         video_url = 'http://video2.spiegel.de/flash/' + filename
4307         video_ext = filename.rpartition('.')[2]
4308         info = {
4309             'id': video_id,
4310             'url': video_url,
4311             'ext': video_ext,
4312             'title': video_title,
4313             'duration': duration,
4314         }
4315         return [info]
4316
4317 class LiveLeakIE(InfoExtractor):
4318
4319     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4320     IE_NAME = u'liveleak'
4321
4322     def _real_extract(self, url):
4323         mobj = re.match(self._VALID_URL, url)
4324         if mobj is None:
4325             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4326             return
4327
4328         video_id = mobj.group('video_id')
4329
4330         webpage = self._download_webpage(url, video_id)
4331
4332         m = re.search(r'file: "(.*?)",', webpage)
4333         if not m:
4334             self._downloader.report_error(u'unable to find video url')
4335             return
4336         video_url = m.group(1)
4337
4338         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4339         if not m:
4340             self._downloader.trouble(u'Cannot find video title')
4341         title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4342
4343         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4344         if m:
4345             desc = unescapeHTML(m.group('desc'))
4346         else:
4347             desc = None
4348
4349         m = re.search(r'By:.*?(\w+)</a>', webpage)
4350         if m:
4351             uploader = clean_html(m.group(1))
4352         else:
4353             uploader = None
4354
4355         info = {
4356             'id':  video_id,
4357             'url': video_url,
4358             'ext': 'mp4',
4359             'title': title,
4360             'description': desc,
4361             'uploader': uploader
4362         }
4363
4364         return [info]
4365
4366 class ARDIE(InfoExtractor):
4367     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4368     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4369     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4370
4371     def _real_extract(self, url):
4372         # determine video id from url
4373         m = re.match(self._VALID_URL, url)
4374
4375         numid = re.search(r'documentId=([0-9]+)', url)
4376         if numid:
4377             video_id = numid.group(1)
4378         else:
4379             video_id = m.group('video_id')
4380
4381         # determine title and media streams from webpage
4382         html = self._download_webpage(url, video_id)
4383         title = re.search(self._TITLE, html).group('title')
4384         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4385         if not streams:
4386             assert '"fsk"' in html
4387             self._downloader.report_error(u'this video is only available after 8:00 pm')
4388             return
4389
4390         # choose default media type and highest quality for now
4391         stream = max([s for s in streams if int(s["media_type"]) == 0],
4392                      key=lambda s: int(s["quality"]))
4393
4394         # there's two possibilities: RTMP stream or HTTP download
4395         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4396         if stream['rtmp_url']:
4397             self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
4398             assert stream['video_url'].startswith('mp4:')
4399             info["url"] = stream["rtmp_url"]
4400             info["play_path"] = stream['video_url']
4401         else:
4402             assert stream["video_url"].endswith('.mp4')
4403             info["url"] = stream["video_url"]
4404         return [info]
4405
4406
4407 def gen_extractors():
4408     """ Return a list of an instance of every supported extractor.
4409     The order does matter; the first extractor matched is the one handling the URL.
4410     """
4411     return [
4412         YoutubePlaylistIE(),
4413         YoutubeChannelIE(),
4414         YoutubeUserIE(),
4415         YoutubeSearchIE(),
4416         YoutubeIE(),
4417         MetacafeIE(),
4418         DailymotionIE(),
4419         GoogleSearchIE(),
4420         PhotobucketIE(),
4421         YahooIE(),
4422         YahooSearchIE(),
4423         DepositFilesIE(),
4424         FacebookIE(),
4425         BlipTVUserIE(),
4426         BlipTVIE(),
4427         VimeoIE(),
4428         MyVideoIE(),
4429         ComedyCentralIE(),
4430         EscapistIE(),
4431         CollegeHumorIE(),
4432         XVideosIE(),
4433         SoundcloudSetIE(),
4434         SoundcloudIE(),
4435         InfoQIE(),
4436         MixcloudIE(),
4437         StanfordOpenClassroomIE(),
4438         MTVIE(),
4439         YoukuIE(),
4440         XNXXIE(),
4441         YouJizzIE(),
4442         PornotubeIE(),
4443         YouPornIE(),
4444         GooglePlusIE(),
4445         ArteTvIE(),
4446         NBAIE(),
4447         WorldStarHipHopIE(),
4448         JustinTVIE(),
4449         FunnyOrDieIE(),
4450         SteamIE(),
4451         UstreamIE(),
4452         RBMARadioIE(),
4453         EightTracksIE(),
4454         KeekIE(),
4455         TEDIE(),
4456         MySpassIE(),
4457         SpiegelIE(),
4458         LiveLeakIE(),
4459         ARDIE(),
4460         GenericIE()
4461     ]