_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             note = u'Downloading video webpage'
 118         if note is not False:
 119             self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 120         try:
 121             return compat_urllib_request.urlopen(url_or_request)
 122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 123             if errnote is None:
 124                 errnote = u'Unable to download webpage'
 125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 126
 127     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 128         """ Returns the data of the page as a string """
 129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 130         content_type = urlh.headers.get('Content-Type', '')
 131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 132         if m:
 133             encoding = m.group(1)
 134         else:
 135             encoding = 'utf-8'
 136         webpage_bytes = urlh.read()
 137         if self._downloader.params.get('dump_intermediate_pages', False):
 138             try:
 139                 url = url_or_request.get_full_url()
 140             except AttributeError:
 141                 url = url_or_request
 142             self._downloader.to_screen(u'Dumping request to ' + url)
 143             dump = base64.b64encode(webpage_bytes).decode('ascii')
 144             self._downloader.to_screen(dump)
 145         return webpage_bytes.decode(encoding, 'replace')
 146
 147     #Methods for following #608
 148     #They set the correct value of the '_type' key
 149     def video_result(self, video_info):
 150         """Returns a video"""
 151         video_info['_type'] = 'video'
 152         return video_info
 153     def url_result(self, url, ie=None):
 154         """Returns a url that points to a page that should be processed"""
 155         #TODO: ie should be the class used for getting the info
 156         video_info = {'_type': 'url',
 157                       'url': url}
 158         return video_info
 159     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 160         """Returns a playlist"""
 161         video_info = {'_type': 'playlist',
 162                       'entries': entries}
 163         if playlist_id:
 164             video_info['id'] = playlist_id
 165         if playlist_title:
 166             video_info['title'] = playlist_title
 167         return video_info
 168
 169
 170 class YoutubeIE(InfoExtractor):
 171     """Information extractor for youtube.com."""
 172
 173     _VALID_URL = r"""^
 174                      (
 175                          (?:https?://)?                                       # http(s):// (optional)
 176                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 177                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 178                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 179                          (?:                                                  # the various things that can precede the ID:
 180                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 181                              |(?:                                             # or the v= param in all its forms
 182                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 183                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 184                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 185                                  v=
 186                              )
 187                          )?                                                   # optional -> youtube.com/xxxx is OK
 188                      )?                                                       # all until now is optional -> you can pass the naked ID
 189                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 190                      (?(1).+)?                                                # if we found the ID, everything can follow
 191                      $"""
 192     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 193     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 194     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 195     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 196     _NETRC_MACHINE = 'youtube'
 197     # Listed in order of quality
 198     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 199     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 200     _video_extensions = {
 201         '13': '3gp',
 202         '17': 'mp4',
 203         '18': 'mp4',
 204         '22': 'mp4',
 205         '37': 'mp4',
 206         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 207         '43': 'webm',
 208         '44': 'webm',
 209         '45': 'webm',
 210         '46': 'webm',
 211     }
 212     _video_dimensions = {
 213         '5': '240x400',
 214         '6': '???',
 215         '13': '???',
 216         '17': '144x176',
 217         '18': '360x640',
 218         '22': '720x1280',
 219         '34': '360x640',
 220         '35': '480x854',
 221         '37': '1080x1920',
 222         '38': '3072x4096',
 223         '43': '360x640',
 224         '44': '480x854',
 225         '45': '720x1280',
 226         '46': '1080x1920',
 227     }
 228     IE_NAME = u'youtube'
 229
 230     @classmethod
 231     def suitable(cls, url):
 232         """Receives a URL and returns True if suitable for this IE."""
 233         if YoutubePlaylistIE.suitable(url): return False
 234         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 235
 236     def report_lang(self):
 237         """Report attempt to set language."""
 238         self._downloader.to_screen(u'[youtube] Setting language')
 239
 240     def report_login(self):
 241         """Report attempt to log in."""
 242         self._downloader.to_screen(u'[youtube] Logging in')
 243
 244     def report_age_confirmation(self):
 245         """Report attempt to confirm age."""
 246         self._downloader.to_screen(u'[youtube] Confirming age')
 247
 248     def report_video_webpage_download(self, video_id):
 249         """Report attempt to download video webpage."""
 250         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 251
 252     def report_video_info_webpage_download(self, video_id):
 253         """Report attempt to download video info webpage."""
 254         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 255
 256     def report_video_subtitles_download(self, video_id):
 257         """Report attempt to download video info webpage."""
 258         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
 259
 260     def report_video_subtitles_request(self, video_id, sub_lang, format):
 261         """Report attempt to download video info webpage."""
 262         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 263
 264     def report_video_subtitles_available(self, video_id, sub_lang_list):
 265         """Report available subtitles."""
 266         sub_lang = ",".join(list(sub_lang_list.keys()))
 267         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
 268
 269     def report_information_extraction(self, video_id):
 270         """Report attempt to extract video information."""
 271         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 272
 273     def report_unavailable_format(self, video_id, format):
 274         """Report extracted video URL."""
 275         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 276
 277     def report_rtmp_download(self):
 278         """Indicate the download will use the RTMP protocol."""
 279         self._downloader.to_screen(u'[youtube] RTMP download detected')
 280
 281     def _get_available_subtitles(self, video_id):
 282         self.report_video_subtitles_download(video_id)
 283         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 284         try:
 285             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 286         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 287             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 288         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 289         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 290         if not sub_lang_list:
 291             return (u'video doesn\'t have subtitles', None)
 292         return sub_lang_list
 293
 294     def _list_available_subtitles(self, video_id):
 295         sub_lang_list = self._get_available_subtitles(video_id)
 296         self.report_video_subtitles_available(video_id, sub_lang_list)
 297
 298     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 299         """
 300         Return tuple:
 301         (error_message, sub_lang, sub)
 302         """
 303         self.report_video_subtitles_request(video_id, sub_lang, format)
 304         params = compat_urllib_parse.urlencode({
 305             'lang': sub_lang,
 306             'name': sub_name,
 307             'v': video_id,
 308             'fmt': format,
 309         })
 310         url = 'http://www.youtube.com/api/timedtext?' + params
 311         try:
 312             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 313         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 314             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 315         if not sub:
 316             return (u'Did not fetch video subtitles', None, None)
 317         return (None, sub_lang, sub)
 318
 319     def _extract_subtitle(self, video_id):
 320         """
 321         Return a list with a tuple:
 322         [(error_message, sub_lang, sub)]
 323         """
 324         sub_lang_list = self._get_available_subtitles(video_id)
 325         sub_format = self._downloader.params.get('subtitlesformat')
 326         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 327             return [(sub_lang_list[0], None, None)]
 328         if self._downloader.params.get('subtitleslang', False):
 329             sub_lang = self._downloader.params.get('subtitleslang')
 330         elif 'en' in sub_lang_list:
 331             sub_lang = 'en'
 332         else:
 333             sub_lang = list(sub_lang_list.keys())[0]
 334         if not sub_lang in sub_lang_list:
 335             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 336
 337         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 338         return [subtitle]
 339
 340     def _extract_all_subtitles(self, video_id):
 341         sub_lang_list = self._get_available_subtitles(video_id)
 342         sub_format = self._downloader.params.get('subtitlesformat')
 343         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 344             return [(sub_lang_list[0], None, None)]
 345         subtitles = []
 346         for sub_lang in sub_lang_list:
 347             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 348             subtitles.append(subtitle)
 349         return subtitles
 350
 351     def _print_formats(self, formats):
 352         print('Available formats:')
 353         for x in formats:
 354             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 355
 356     def _real_initialize(self):
 357         if self._downloader is None:
 358             return
 359
 360         username = None
 361         password = None
 362         downloader_params = self._downloader.params
 363
 364         # Attempt to use provided username and password or .netrc data
 365         if downloader_params.get('username', None) is not None:
 366             username = downloader_params['username']
 367             password = downloader_params['password']
 368         elif downloader_params.get('usenetrc', False):
 369             try:
 370                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 371                 if info is not None:
 372                     username = info[0]
 373                     password = info[2]
 374                 else:
 375                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 376             except (IOError, netrc.NetrcParseError) as err:
 377                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 378                 return
 379
 380         # Set language
 381         request = compat_urllib_request.Request(self._LANG_URL)
 382         try:
 383             self.report_lang()
 384             compat_urllib_request.urlopen(request).read()
 385         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 386             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 387             return
 388
 389         # No authentication to be performed
 390         if username is None:
 391             return
 392
 393         request = compat_urllib_request.Request(self._LOGIN_URL)
 394         try:
 395             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 396         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 397             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 398             return
 399
 400         galx = None
 401         dsh = None
 402         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 403         if match:
 404           galx = match.group(1)
 405
 406         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 407         if match:
 408           dsh = match.group(1)
 409
 410         # Log in
 411         login_form_strs = {
 412                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 413                 u'Email': username,
 414                 u'GALX': galx,
 415                 u'Passwd': password,
 416                 u'PersistentCookie': u'yes',
 417                 u'_utf8': u'霱',
 418                 u'bgresponse': u'js_disabled',
 419                 u'checkConnection': u'',
 420                 u'checkedDomains': u'youtube',
 421                 u'dnConn': u'',
 422                 u'dsh': dsh,
 423                 u'pstMsg': u'0',
 424                 u'rmShown': u'1',
 425                 u'secTok': u'',
 426                 u'signIn': u'Sign in',
 427                 u'timeStmp': u'',
 428                 u'service': u'youtube',
 429                 u'uilel': u'3',
 430                 u'hl': u'en_US',
 431         }
 432         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 433         # chokes on unicode
 434         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 435         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 436         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 437         try:
 438             self.report_login()
 439             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 440             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 441                 self._downloader.report_warning(u'unable to log in: bad username or password')
 442                 return
 443         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 444             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 445             return
 446
 447         # Confirm age
 448         age_form = {
 449                 'next_url':     '/',
 450                 'action_confirm':   'Confirm',
 451                 }
 452         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 453         try:
 454             self.report_age_confirmation()
 455             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 456         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 457             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 458             return
 459
 460     def _extract_id(self, url):
 461         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 462         if mobj is None:
 463             self._downloader.report_error(u'invalid URL: %s' % url)
 464             return
 465         video_id = mobj.group(2)
 466         return video_id
 467
 468     def _real_extract(self, url):
 469         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 470         mobj = re.search(self._NEXT_URL_RE, url)
 471         if mobj:
 472             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 473         video_id = self._extract_id(url)
 474
 475         # Get video webpage
 476         self.report_video_webpage_download(video_id)
 477         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 478         request = compat_urllib_request.Request(url)
 479         try:
 480             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 481         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 482             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
 483             return
 484
 485         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 486
 487         # Attempt to extract SWF player URL
 488         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 489         if mobj is not None:
 490             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 491         else:
 492             player_url = None
 493
 494         # Get video info
 495         self.report_video_info_webpage_download(video_id)
 496         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 497             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 498                     % (video_id, el_type))
 499             video_info_webpage = self._download_webpage(video_info_url, video_id,
 500                                     note=False,
 501                                     errnote='unable to download video info webpage')
 502             video_info = compat_parse_qs(video_info_webpage)
 503             if 'token' in video_info:
 504                 break
 505         if 'token' not in video_info:
 506             if 'reason' in video_info:
 507                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
 508             else:
 509                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
 510             return
 511
 512         # Check for "rental" videos
 513         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 514             self._downloader.report_error(u'"rental" videos not supported')
 515             return
 516
 517         # Start extracting information
 518         self.report_information_extraction(video_id)
 519
 520         # uploader
 521         if 'author' not in video_info:
 522             self._downloader.report_error(u'unable to extract uploader name')
 523             return
 524         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 525
 526         # uploader_id
 527         video_uploader_id = None
 528         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 529         if mobj is not None:
 530             video_uploader_id = mobj.group(1)
 531         else:
 532             self._downloader.report_warning(u'unable to extract uploader nickname')
 533
 534         # title
 535         if 'title' not in video_info:
 536             self._downloader.report_error(u'unable to extract video title')
 537             return
 538         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 539
 540         # thumbnail image
 541         if 'thumbnail_url' not in video_info:
 542             self._downloader.report_warning(u'unable to extract video thumbnail')
 543             video_thumbnail = ''
 544         else:   # don't panic if we can't find it
 545             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 546
 547         # upload date
 548         upload_date = None
 549         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 550         if mobj is not None:
 551             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 552             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 553             for expression in format_expressions:
 554                 try:
 555                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 556                 except:
 557                     pass
 558
 559         # description
 560         video_description = get_element_by_id("eow-description", video_webpage)
 561         if video_description:
 562             video_description = clean_html(video_description)
 563         else:
 564             video_description = ''
 565
 566         # subtitles
 567         video_subtitles = None
 568
 569         if self._downloader.params.get('writesubtitles', False):
 570             video_subtitles = self._extract_subtitle(video_id)
 571             if video_subtitles:
 572                 (sub_error, sub_lang, sub) = video_subtitles[0]
 573                 if sub_error:
 574                     self._downloader.report_error(sub_error)
 575
 576         if self._downloader.params.get('allsubtitles', False):
 577             video_subtitles = self._extract_all_subtitles(video_id)
 578             for video_subtitle in video_subtitles:
 579                 (sub_error, sub_lang, sub) = video_subtitle
 580                 if sub_error:
 581                     self._downloader.report_error(sub_error)
 582
 583         if self._downloader.params.get('listsubtitles', False):
 584             sub_lang_list = self._list_available_subtitles(video_id)
 585             return
 586
 587         if 'length_seconds' not in video_info:
 588             self._downloader.report_warning(u'unable to extract video duration')
 589             video_duration = ''
 590         else:
 591             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 592
 593         # token
 594         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 595
 596         # Decide which formats to download
 597         req_format = self._downloader.params.get('format', None)
 598
 599         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 600             self.report_rtmp_download()
 601             video_url_list = [(None, video_info['conn'][0])]
 602         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 603             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 604             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 605             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 606             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 607
 608             format_limit = self._downloader.params.get('format_limit', None)
 609             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 610             if format_limit is not None and format_limit in available_formats:
 611                 format_list = available_formats[available_formats.index(format_limit):]
 612             else:
 613                 format_list = available_formats
 614             existing_formats = [x for x in format_list if x in url_map]
 615             if len(existing_formats) == 0:
 616                 self._downloader.report_error(u'no known formats available for video')
 617                 return
 618             if self._downloader.params.get('listformats', None):
 619                 self._print_formats(existing_formats)
 620                 return
 621             if req_format is None or req_format == 'best':
 622                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 623             elif req_format == 'worst':
 624                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 625             elif req_format in ('-1', 'all'):
 626                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 627             else:
 628                 # Specific formats. We pick the first in a slash-delimeted sequence.
 629                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 630                 req_formats = req_format.split('/')
 631                 video_url_list = None
 632                 for rf in req_formats:
 633                     if rf in url_map:
 634                         video_url_list = [(rf, url_map[rf])]
 635                         break
 636                 if video_url_list is None:
 637                     self._downloader.report_error(u'requested format not available')
 638                     return
 639         else:
 640             self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
 641             return
 642
 643         results = []
 644         for format_param, video_real_url in video_url_list:
 645             # Extension
 646             video_extension = self._video_extensions.get(format_param, 'flv')
 647
 648             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 649                                               self._video_dimensions.get(format_param, '???'))
 650
 651             results.append({
 652                 'id':       video_id,
 653                 'url':      video_real_url,
 654                 'uploader': video_uploader,
 655                 'uploader_id': video_uploader_id,
 656                 'upload_date':  upload_date,
 657                 'title':    video_title,
 658                 'ext':      video_extension,
 659                 'format':   video_format,
 660                 'thumbnail':    video_thumbnail,
 661                 'description':  video_description,
 662                 'player_url':   player_url,
 663                 'subtitles':    video_subtitles,
 664                 'duration':     video_duration
 665             })
 666         return results
 667
 668
 669 class MetacafeIE(InfoExtractor):
 670     """Information Extractor for metacafe.com."""
 671
 672     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 673     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 674     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 675     IE_NAME = u'metacafe'
 676
 677     def __init__(self, downloader=None):
 678         InfoExtractor.__init__(self, downloader)
 679
 680     def report_disclaimer(self):
 681         """Report disclaimer retrieval."""
 682         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 683
 684     def report_age_confirmation(self):
 685         """Report attempt to confirm age."""
 686         self._downloader.to_screen(u'[metacafe] Confirming age')
 687
 688     def report_download_webpage(self, video_id):
 689         """Report webpage download."""
 690         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 691
 692     def report_extraction(self, video_id):
 693         """Report information extraction."""
 694         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 695
 696     def _real_initialize(self):
 697         # Retrieve disclaimer
 698         request = compat_urllib_request.Request(self._DISCLAIMER)
 699         try:
 700             self.report_disclaimer()
 701             disclaimer = compat_urllib_request.urlopen(request).read()
 702         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 703             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
 704             return
 705
 706         # Confirm age
 707         disclaimer_form = {
 708             'filters': '0',
 709             'submit': "Continue - I'm over 18",
 710             }
 711         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 712         try:
 713             self.report_age_confirmation()
 714             disclaimer = compat_urllib_request.urlopen(request).read()
 715         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 716             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 717             return
 718
 719     def _real_extract(self, url):
 720         # Extract id and simplified title from URL
 721         mobj = re.match(self._VALID_URL, url)
 722         if mobj is None:
 723             self._downloader.report_error(u'invalid URL: %s' % url)
 724             return
 725
 726         video_id = mobj.group(1)
 727
 728         # Check if video comes from YouTube
 729         mobj2 = re.match(r'^yt-(.*)$', video_id)
 730         if mobj2 is not None:
 731             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
 732
 733         # Retrieve video webpage to extract further information
 734         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 735         try:
 736             self.report_download_webpage(video_id)
 737             webpage = compat_urllib_request.urlopen(request).read()
 738         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 739             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
 740             return
 741
 742         # Extract URL, uploader and title from webpage
 743         self.report_extraction(video_id)
 744         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 745         if mobj is not None:
 746             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 747             video_extension = mediaURL[-3:]
 748
 749             # Extract gdaKey if available
 750             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 751             if mobj is None:
 752                 video_url = mediaURL
 753             else:
 754                 gdaKey = mobj.group(1)
 755                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 756         else:
 757             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 758             if mobj is None:
 759                 self._downloader.report_error(u'unable to extract media URL')
 760                 return
 761             vardict = compat_parse_qs(mobj.group(1))
 762             if 'mediaData' not in vardict:
 763                 self._downloader.report_error(u'unable to extract media URL')
 764                 return
 765             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 766             if mobj is None:
 767                 self._downloader.report_error(u'unable to extract media URL')
 768                 return
 769             mediaURL = mobj.group(1).replace('\\/', '/')
 770             video_extension = mediaURL[-3:]
 771             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 772
 773         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 774         if mobj is None:
 775             self._downloader.report_error(u'unable to extract title')
 776             return
 777         video_title = mobj.group(1).decode('utf-8')
 778
 779         mobj = re.search(r'submitter=(.*?);', webpage)
 780         if mobj is None:
 781             self._downloader.report_error(u'unable to extract uploader nickname')
 782             return
 783         video_uploader = mobj.group(1)
 784
 785         return [{
 786             'id':       video_id.decode('utf-8'),
 787             'url':      video_url.decode('utf-8'),
 788             'uploader': video_uploader.decode('utf-8'),
 789             'upload_date':  None,
 790             'title':    video_title,
 791             'ext':      video_extension.decode('utf-8'),
 792         }]
 793
 794
 795 class DailymotionIE(InfoExtractor):
 796     """Information Extractor for Dailymotion"""
 797
 798     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 799     IE_NAME = u'dailymotion'
 800     _WORKING = False
 801
 802     def __init__(self, downloader=None):
 803         InfoExtractor.__init__(self, downloader)
 804
 805     def report_extraction(self, video_id):
 806         """Report information extraction."""
 807         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 808
 809     def _real_extract(self, url):
 810         # Extract id and simplified title from URL
 811         mobj = re.match(self._VALID_URL, url)
 812         if mobj is None:
 813             self._downloader.report_error(u'invalid URL: %s' % url)
 814             return
 815
 816         video_id = mobj.group(1).split('_')[0].split('?')[0]
 817
 818         video_extension = 'mp4'
 819
 820         # Retrieve video webpage to extract further information
 821         request = compat_urllib_request.Request(url)
 822         request.add_header('Cookie', 'family_filter=off')
 823         webpage = self._download_webpage(request, video_id)
 824
 825         # Extract URL, uploader and title from webpage
 826         self.report_extraction(video_id)
 827         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 828         if mobj is None:
 829             self._downloader.report_error(u'unable to extract media URL')
 830             return
 831         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 832
 833         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 834             if key in flashvars:
 835                 max_quality = key
 836                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 837                 break
 838         else:
 839             self._downloader.report_error(u'unable to extract video URL')
 840             return
 841
 842         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 843         if mobj is None:
 844             self._downloader.report_error(u'unable to extract video URL')
 845             return
 846
 847         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 848
 849         # TODO: support choosing qualities
 850
 851         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 852         if mobj is None:
 853             self._downloader.report_error(u'unable to extract title')
 854             return
 855         video_title = unescapeHTML(mobj.group('title'))
 856
 857         video_uploader = None
 858         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 859         if mobj is None:
 860             # lookin for official user
 861             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 862             if mobj_official is None:
 863                 self._downloader.report_warning(u'unable to extract uploader nickname')
 864             else:
 865                 video_uploader = mobj_official.group(1)
 866         else:
 867             video_uploader = mobj.group(1)
 868
 869         video_upload_date = None
 870         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 871         if mobj is not None:
 872             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 873
 874         return [{
 875             'id':       video_id,
 876             'url':      video_url,
 877             'uploader': video_uploader,
 878             'upload_date':  video_upload_date,
 879             'title':    video_title,
 880             'ext':      video_extension,
 881         }]
 882
 883
 884 class PhotobucketIE(InfoExtractor):
 885     """Information extractor for photobucket.com."""
 886
 887     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 888     IE_NAME = u'photobucket'
 889
 890     def __init__(self, downloader=None):
 891         InfoExtractor.__init__(self, downloader)
 892
 893     def report_download_webpage(self, video_id):
 894         """Report webpage download."""
 895         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 896
 897     def report_extraction(self, video_id):
 898         """Report information extraction."""
 899         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 900
 901     def _real_extract(self, url):
 902         # Extract id from URL
 903         mobj = re.match(self._VALID_URL, url)
 904         if mobj is None:
 905             self._downloader.report_error(u'Invalid URL: %s' % url)
 906             return
 907
 908         video_id = mobj.group(1)
 909
 910         video_extension = 'flv'
 911
 912         # Retrieve video webpage to extract further information
 913         request = compat_urllib_request.Request(url)
 914         try:
 915             self.report_download_webpage(video_id)
 916             webpage = compat_urllib_request.urlopen(request).read()
 917         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 918             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 919             return
 920
 921         # Extract URL, uploader, and title from webpage
 922         self.report_extraction(video_id)
 923         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 924         if mobj is None:
 925             self._downloader.report_error(u'unable to extract media URL')
 926             return
 927         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 928
 929         video_url = mediaURL
 930
 931         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 932         if mobj is None:
 933             self._downloader.report_error(u'unable to extract title')
 934             return
 935         video_title = mobj.group(1).decode('utf-8')
 936
 937         video_uploader = mobj.group(2).decode('utf-8')
 938
 939         return [{
 940             'id':       video_id.decode('utf-8'),
 941             'url':      video_url.decode('utf-8'),
 942             'uploader': video_uploader,
 943             'upload_date':  None,
 944             'title':    video_title,
 945             'ext':      video_extension.decode('utf-8'),
 946         }]
 947
 948
 949 class YahooIE(InfoExtractor):
 950     """Information extractor for video.yahoo.com."""
 951
 952     _WORKING = False
 953     # _VALID_URL matches all Yahoo! Video URLs
 954     # _VPAGE_URL matches only the extractable '/watch/' URLs
 955     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 956     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 957     IE_NAME = u'video.yahoo'
 958
 959     def __init__(self, downloader=None):
 960         InfoExtractor.__init__(self, downloader)
 961
 962     def report_download_webpage(self, video_id):
 963         """Report webpage download."""
 964         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 965
 966     def report_extraction(self, video_id):
 967         """Report information extraction."""
 968         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 969
 970     def _real_extract(self, url, new_video=True):
 971         # Extract ID from URL
 972         mobj = re.match(self._VALID_URL, url)
 973         if mobj is None:
 974             self._downloader.report_error(u'Invalid URL: %s' % url)
 975             return
 976
 977         video_id = mobj.group(2)
 978         video_extension = 'flv'
 979
 980         # Rewrite valid but non-extractable URLs as
 981         # extractable English language /watch/ URLs
 982         if re.match(self._VPAGE_URL, url) is None:
 983             request = compat_urllib_request.Request(url)
 984             try:
 985                 webpage = compat_urllib_request.urlopen(request).read()
 986             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 987                 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 988                 return
 989
 990             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 991             if mobj is None:
 992                 self._downloader.report_error(u'Unable to extract id field')
 993                 return
 994             yahoo_id = mobj.group(1)
 995
 996             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 997             if mobj is None:
 998                 self._downloader.report_error(u'Unable to extract vid field')
 999                 return
1000             yahoo_vid = mobj.group(1)
1001
1002             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1003             return self._real_extract(url, new_video=False)
1004
1005         # Retrieve video webpage to extract further information
1006         request = compat_urllib_request.Request(url)
1007         try:
1008             self.report_download_webpage(video_id)
1009             webpage = compat_urllib_request.urlopen(request).read()
1010         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1012             return
1013
1014         # Extract uploader and title from webpage
1015         self.report_extraction(video_id)
1016         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1017         if mobj is None:
1018             self._downloader.report_error(u'unable to extract video title')
1019             return
1020         video_title = mobj.group(1).decode('utf-8')
1021
1022         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1023         if mobj is None:
1024             self._downloader.report_error(u'unable to extract video uploader')
1025             return
1026         video_uploader = mobj.group(1).decode('utf-8')
1027
1028         # Extract video thumbnail
1029         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1030         if mobj is None:
1031             self._downloader.report_error(u'unable to extract video thumbnail')
1032             return
1033         video_thumbnail = mobj.group(1).decode('utf-8')
1034
1035         # Extract video description
1036         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1037         if mobj is None:
1038             self._downloader.report_error(u'unable to extract video description')
1039             return
1040         video_description = mobj.group(1).decode('utf-8')
1041         if not video_description:
1042             video_description = 'No description available.'
1043
1044         # Extract video height and width
1045         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1046         if mobj is None:
1047             self._downloader.report_error(u'unable to extract video height')
1048             return
1049         yv_video_height = mobj.group(1)
1050
1051         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1052         if mobj is None:
1053             self._downloader.report_error(u'unable to extract video width')
1054             return
1055         yv_video_width = mobj.group(1)
1056
1057         # Retrieve video playlist to extract media URL
1058         # I'm not completely sure what all these options are, but we
1059         # seem to need most of them, otherwise the server sends a 401.
1060         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1061         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1062         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1063                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1064                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1065         try:
1066             self.report_download_webpage(video_id)
1067             webpage = compat_urllib_request.urlopen(request).read()
1068         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1069             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1070             return
1071
1072         # Extract media URL from playlist XML
1073         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1074         if mobj is None:
1075             self._downloader.report_error(u'Unable to extract media URL')
1076             return
1077         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1078         video_url = unescapeHTML(video_url)
1079
1080         return [{
1081             'id':       video_id.decode('utf-8'),
1082             'url':      video_url,
1083             'uploader': video_uploader,
1084             'upload_date':  None,
1085             'title':    video_title,
1086             'ext':      video_extension.decode('utf-8'),
1087             'thumbnail':    video_thumbnail.decode('utf-8'),
1088             'description':  video_description,
1089         }]
1090
1091
1092 class VimeoIE(InfoExtractor):
1093     """Information extractor for vimeo.com."""
1094
1095     # _VALID_URL matches Vimeo URLs
1096     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1097     IE_NAME = u'vimeo'
1098
1099     def __init__(self, downloader=None):
1100         InfoExtractor.__init__(self, downloader)
1101
1102     def report_download_webpage(self, video_id):
1103         """Report webpage download."""
1104         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1105
1106     def report_extraction(self, video_id):
1107         """Report information extraction."""
1108         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1109
1110     def _real_extract(self, url, new_video=True):
1111         # Extract ID from URL
1112         mobj = re.match(self._VALID_URL, url)
1113         if mobj is None:
1114             self._downloader.report_error(u'Invalid URL: %s' % url)
1115             return
1116
1117         video_id = mobj.group('id')
1118         if not mobj.group('proto'):
1119             url = 'https://' + url
1120         if mobj.group('direct_link'):
1121             url = 'https://vimeo.com/' + video_id
1122
1123         # Retrieve video webpage to extract further information
1124         request = compat_urllib_request.Request(url, None, std_headers)
1125         try:
1126             self.report_download_webpage(video_id)
1127             webpage_bytes = compat_urllib_request.urlopen(request).read()
1128             webpage = webpage_bytes.decode('utf-8')
1129         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1130             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1131             return
1132
1133         # Now we begin extracting as much information as we can from what we
1134         # retrieved. First we extract the information common to all extractors,
1135         # and latter we extract those that are Vimeo specific.
1136         self.report_extraction(video_id)
1137
1138         # Extract the config JSON
1139         try:
1140             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1141             config = json.loads(config)
1142         except:
1143             self._downloader.report_error(u'unable to extract info section')
1144             return
1145
1146         # Extract title
1147         video_title = config["video"]["title"]
1148
1149         # Extract uploader and uploader_id
1150         video_uploader = config["video"]["owner"]["name"]
1151         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1152
1153         # Extract video thumbnail
1154         video_thumbnail = config["video"]["thumbnail"]
1155
1156         # Extract video description
1157         video_description = get_element_by_attribute("itemprop", "description", webpage)
1158         if video_description: video_description = clean_html(video_description)
1159         else: video_description = u''
1160
1161         # Extract upload date
1162         video_upload_date = None
1163         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1164         if mobj is not None:
1165             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1166
1167         # Vimeo specific: extract request signature and timestamp
1168         sig = config['request']['signature']
1169         timestamp = config['request']['timestamp']
1170
1171         # Vimeo specific: extract video codec and quality information
1172         # First consider quality, then codecs, then take everything
1173         # TODO bind to format param
1174         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1175         files = { 'hd': [], 'sd': [], 'other': []}
1176         for codec_name, codec_extension in codecs:
1177             if codec_name in config["video"]["files"]:
1178                 if 'hd' in config["video"]["files"][codec_name]:
1179                     files['hd'].append((codec_name, codec_extension, 'hd'))
1180                 elif 'sd' in config["video"]["files"][codec_name]:
1181                     files['sd'].append((codec_name, codec_extension, 'sd'))
1182                 else:
1183                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1184
1185         for quality in ('hd', 'sd', 'other'):
1186             if len(files[quality]) > 0:
1187                 video_quality = files[quality][0][2]
1188                 video_codec = files[quality][0][0]
1189                 video_extension = files[quality][0][1]
1190                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1191                 break
1192         else:
1193             self._downloader.report_error(u'no known codec found')
1194             return
1195
1196         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1197                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1198
1199         return [{
1200             'id':       video_id,
1201             'url':      video_url,
1202             'uploader': video_uploader,
1203             'uploader_id': video_uploader_id,
1204             'upload_date':  video_upload_date,
1205             'title':    video_title,
1206             'ext':      video_extension,
1207             'thumbnail':    video_thumbnail,
1208             'description':  video_description,
1209         }]
1210
1211
1212 class ArteTvIE(InfoExtractor):
1213     """arte.tv information extractor."""
1214
1215     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1216     _LIVE_URL = r'index-[0-9]+\.html$'
1217
1218     IE_NAME = u'arte.tv'
1219
1220     def __init__(self, downloader=None):
1221         InfoExtractor.__init__(self, downloader)
1222
1223     def report_download_webpage(self, video_id):
1224         """Report webpage download."""
1225         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1226
1227     def report_extraction(self, video_id):
1228         """Report information extraction."""
1229         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1230
1231     def fetch_webpage(self, url):
1232         request = compat_urllib_request.Request(url)
1233         try:
1234             self.report_download_webpage(url)
1235             webpage = compat_urllib_request.urlopen(request).read()
1236         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1237             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1238             return
1239         except ValueError as err:
1240             self._downloader.report_error(u'Invalid URL: %s' % url)
1241             return
1242         return webpage
1243
1244     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1245         page = self.fetch_webpage(url)
1246         mobj = re.search(regex, page, regexFlags)
1247         info = {}
1248
1249         if mobj is None:
1250             self._downloader.report_error(u'Invalid URL: %s' % url)
1251             return
1252
1253         for (i, key, err) in matchTuples:
1254             if mobj.group(i) is None:
1255                 self._downloader.trouble(err)
1256                 return
1257             else:
1258                 info[key] = mobj.group(i)
1259
1260         return info
1261
1262     def extractLiveStream(self, url):
1263         video_lang = url.split('/')[-4]
1264         info = self.grep_webpage(
1265             url,
1266             r'src="(.*?/videothek_js.*?\.js)',
1267             0,
1268             [
1269                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1270             ]
1271         )
1272         http_host = url.split('/')[2]
1273         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1274         info = self.grep_webpage(
1275             next_url,
1276             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1277                 '(http://.*?\.swf).*?' +
1278                 '(rtmp://.*?)\'',
1279             re.DOTALL,
1280             [
1281                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1282                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1283                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1284             ]
1285         )
1286         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1287
1288     def extractPlus7Stream(self, url):
1289         video_lang = url.split('/')[-3]
1290         info = self.grep_webpage(
1291             url,
1292             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1293             0,
1294             [
1295                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1296             ]
1297         )
1298         next_url = compat_urllib_parse.unquote(info.get('url'))
1299         info = self.grep_webpage(
1300             next_url,
1301             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1302             0,
1303             [
1304                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1305             ]
1306         )
1307         next_url = compat_urllib_parse.unquote(info.get('url'))
1308
1309         info = self.grep_webpage(
1310             next_url,
1311             r'<video id="(.*?)".*?>.*?' +
1312                 '<name>(.*?)</name>.*?' +
1313                 '<dateVideo>(.*?)</dateVideo>.*?' +
1314                 '<url quality="hd">(.*?)</url>',
1315             re.DOTALL,
1316             [
1317                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1318                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1319                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1320                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1321             ]
1322         )
1323
1324         return {
1325             'id':           info.get('id'),
1326             'url':          compat_urllib_parse.unquote(info.get('url')),
1327             'uploader':     u'arte.tv',
1328             'upload_date':  info.get('date'),
1329             'title':        info.get('title').decode('utf-8'),
1330             'ext':          u'mp4',
1331             'format':       u'NA',
1332             'player_url':   None,
1333         }
1334
1335     def _real_extract(self, url):
1336         video_id = url.split('/')[-1]
1337         self.report_extraction(video_id)
1338
1339         if re.search(self._LIVE_URL, video_id) is not None:
1340             self.extractLiveStream(url)
1341             return
1342         else:
1343             info = self.extractPlus7Stream(url)
1344
1345         return [info]
1346
1347
1348 class GenericIE(InfoExtractor):
1349     """Generic last-resort information extractor."""
1350
1351     _VALID_URL = r'.*'
1352     IE_NAME = u'generic'
1353
1354     def __init__(self, downloader=None):
1355         InfoExtractor.__init__(self, downloader)
1356
1357     def report_download_webpage(self, video_id):
1358         """Report webpage download."""
1359         if not self._downloader.params.get('test', False):
1360             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1361         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1362
1363     def report_extraction(self, video_id):
1364         """Report information extraction."""
1365         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1366
1367     def report_following_redirect(self, new_url):
1368         """Report information extraction."""
1369         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1370
1371     def _test_redirect(self, url):
1372         """Check if it is a redirect, like url shorteners, in case return the new url."""
1373         class HeadRequest(compat_urllib_request.Request):
1374             def get_method(self):
1375                 return "HEAD"
1376
1377         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1378             """
1379             Subclass the HTTPRedirectHandler to make it use our
1380             HeadRequest also on the redirected URL
1381             """
1382             def redirect_request(self, req, fp, code, msg, headers, newurl):
1383                 if code in (301, 302, 303, 307):
1384                     newurl = newurl.replace(' ', '%20')
1385                     newheaders = dict((k,v) for k,v in req.headers.items()
1386                                       if k.lower() not in ("content-length", "content-type"))
1387                     return HeadRequest(newurl,
1388                                        headers=newheaders,
1389                                        origin_req_host=req.get_origin_req_host(),
1390                                        unverifiable=True)
1391                 else:
1392                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1393
1394         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1395             """
1396             Fallback to GET if HEAD is not allowed (405 HTTP error)
1397             """
1398             def http_error_405(self, req, fp, code, msg, headers):
1399                 fp.read()
1400                 fp.close()
1401
1402                 newheaders = dict((k,v) for k,v in req.headers.items()
1403                                   if k.lower() not in ("content-length", "content-type"))
1404                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1405                                                  headers=newheaders,
1406                                                  origin_req_host=req.get_origin_req_host(),
1407                                                  unverifiable=True))
1408
1409         # Build our opener
1410         opener = compat_urllib_request.OpenerDirector()
1411         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1412                         HTTPMethodFallback, HEADRedirectHandler,
1413                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1414             opener.add_handler(handler())
1415
1416         response = opener.open(HeadRequest(url))
1417         new_url = response.geturl()
1418
1419         if url == new_url:
1420             return False
1421
1422         self.report_following_redirect(new_url)
1423         return new_url
1424
1425     def _real_extract(self, url):
1426         new_url = self._test_redirect(url)
1427         if new_url: return [self.url_result(new_url)]
1428
1429         video_id = url.split('/')[-1]
1430         try:
1431             webpage = self._download_webpage(url, video_id)
1432         except ValueError as err:
1433             # since this is the last-resort InfoExtractor, if
1434             # this error is thrown, it'll be thrown here
1435             self._downloader.report_error(u'Invalid URL: %s' % url)
1436             return
1437
1438         self.report_extraction(video_id)
1439         # Start with something easy: JW Player in SWFObject
1440         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1441         if mobj is None:
1442             # Broaden the search a little bit
1443             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1444         if mobj is None:
1445             # Broaden the search a little bit: JWPlayer JS loader
1446             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1447         if mobj is None:
1448             self._downloader.report_error(u'Invalid URL: %s' % url)
1449             return
1450
1451         # It's possible that one of the regexes
1452         # matched, but returned an empty group:
1453         if mobj.group(1) is None:
1454             self._downloader.report_error(u'Invalid URL: %s' % url)
1455             return
1456
1457         video_url = compat_urllib_parse.unquote(mobj.group(1))
1458         video_id = os.path.basename(video_url)
1459
1460         # here's a fun little line of code for you:
1461         video_extension = os.path.splitext(video_id)[1][1:]
1462         video_id = os.path.splitext(video_id)[0]
1463
1464         # it's tempting to parse this further, but you would
1465         # have to take into account all the variations like
1466         #   Video Title - Site Name
1467         #   Site Name | Video Title
1468         #   Video Title - Tagline | Site Name
1469         # and so on and so forth; it's just not practical
1470         mobj = re.search(r'<title>(.*)</title>', webpage)
1471         if mobj is None:
1472             self._downloader.report_error(u'unable to extract title')
1473             return
1474         video_title = mobj.group(1)
1475
1476         # video uploader is domain name
1477         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1478         if mobj is None:
1479             self._downloader.report_error(u'unable to extract title')
1480             return
1481         video_uploader = mobj.group(1)
1482
1483         return [{
1484             'id':       video_id,
1485             'url':      video_url,
1486             'uploader': video_uploader,
1487             'upload_date':  None,
1488             'title':    video_title,
1489             'ext':      video_extension,
1490         }]
1491
1492
1493 class YoutubeSearchIE(InfoExtractor):
1494     """Information Extractor for YouTube search queries."""
1495     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1496     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1497     _max_youtube_results = 1000
1498     IE_NAME = u'youtube:search'
1499
1500     def __init__(self, downloader=None):
1501         InfoExtractor.__init__(self, downloader)
1502
1503     def report_download_page(self, query, pagenum):
1504         """Report attempt to download search page with given number."""
1505         query = query.decode(preferredencoding())
1506         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1507
1508     def _real_extract(self, query):
1509         mobj = re.match(self._VALID_URL, query)
1510         if mobj is None:
1511             self._downloader.report_error(u'invalid search query "%s"' % query)
1512             return
1513
1514         prefix, query = query.split(':')
1515         prefix = prefix[8:]
1516         query = query.encode('utf-8')
1517         if prefix == '':
1518             self._download_n_results(query, 1)
1519             return
1520         elif prefix == 'all':
1521             self._download_n_results(query, self._max_youtube_results)
1522             return
1523         else:
1524             try:
1525                 n = int(prefix)
1526                 if n <= 0:
1527                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1528                     return
1529                 elif n > self._max_youtube_results:
1530                     self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1531                     n = self._max_youtube_results
1532                 self._download_n_results(query, n)
1533                 return
1534             except ValueError: # parsing prefix as integer fails
1535                 self._download_n_results(query, 1)
1536                 return
1537
1538     def _download_n_results(self, query, n):
1539         """Downloads a specified number of results for a query"""
1540
1541         video_ids = []
1542         pagenum = 0
1543         limit = n
1544
1545         while (50 * pagenum) < limit:
1546             self.report_download_page(query, pagenum+1)
1547             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1548             request = compat_urllib_request.Request(result_url)
1549             try:
1550                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1551             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1552                 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1553                 return
1554             api_response = json.loads(data)['data']
1555
1556             if not 'items' in api_response:
1557                 self._downloader.trouble(u'[youtube] No video results')
1558                 return
1559
1560             new_ids = list(video['id'] for video in api_response['items'])
1561             video_ids += new_ids
1562
1563             limit = min(n, api_response['totalItems'])
1564             pagenum += 1
1565
1566         if len(video_ids) > n:
1567             video_ids = video_ids[:n]
1568         for id in video_ids:
1569             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1570         return
1571
1572
1573 class GoogleSearchIE(InfoExtractor):
1574     """Information Extractor for Google Video search queries."""
1575     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1576     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1577     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1578     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1579     _max_google_results = 1000
1580     IE_NAME = u'video.google:search'
1581
1582     def __init__(self, downloader=None):
1583         InfoExtractor.__init__(self, downloader)
1584
1585     def report_download_page(self, query, pagenum):
1586         """Report attempt to download playlist page with given number."""
1587         query = query.decode(preferredencoding())
1588         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1589
1590     def _real_extract(self, query):
1591         mobj = re.match(self._VALID_URL, query)
1592         if mobj is None:
1593             self._downloader.report_error(u'invalid search query "%s"' % query)
1594             return
1595
1596         prefix, query = query.split(':')
1597         prefix = prefix[8:]
1598         query = query.encode('utf-8')
1599         if prefix == '':
1600             self._download_n_results(query, 1)
1601             return
1602         elif prefix == 'all':
1603             self._download_n_results(query, self._max_google_results)
1604             return
1605         else:
1606             try:
1607                 n = int(prefix)
1608                 if n <= 0:
1609                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1610                     return
1611                 elif n > self._max_google_results:
1612                     self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1613                     n = self._max_google_results
1614                 self._download_n_results(query, n)
1615                 return
1616             except ValueError: # parsing prefix as integer fails
1617                 self._download_n_results(query, 1)
1618                 return
1619
1620     def _download_n_results(self, query, n):
1621         """Downloads a specified number of results for a query"""
1622
1623         video_ids = []
1624         pagenum = 0
1625
1626         while True:
1627             self.report_download_page(query, pagenum)
1628             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1629             request = compat_urllib_request.Request(result_url)
1630             try:
1631                 page = compat_urllib_request.urlopen(request).read()
1632             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1633                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1634                 return
1635
1636             # Extract video identifiers
1637             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1638                 video_id = mobj.group(1)
1639                 if video_id not in video_ids:
1640                     video_ids.append(video_id)
1641                     if len(video_ids) == n:
1642                         # Specified n videos reached
1643                         for id in video_ids:
1644                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1645                         return
1646
1647             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1648                 for id in video_ids:
1649                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1650                 return
1651
1652             pagenum = pagenum + 1
1653
1654
1655 class YahooSearchIE(InfoExtractor):
1656     """Information Extractor for Yahoo! Video search queries."""
1657
1658     _WORKING = False
1659     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1660     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1661     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1662     _MORE_PAGES_INDICATOR = r'\s*Next'
1663     _max_yahoo_results = 1000
1664     IE_NAME = u'video.yahoo:search'
1665
1666     def __init__(self, downloader=None):
1667         InfoExtractor.__init__(self, downloader)
1668
1669     def report_download_page(self, query, pagenum):
1670         """Report attempt to download playlist page with given number."""
1671         query = query.decode(preferredencoding())
1672         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1673
1674     def _real_extract(self, query):
1675         mobj = re.match(self._VALID_URL, query)
1676         if mobj is None:
1677             self._downloader.report_error(u'invalid search query "%s"' % query)
1678             return
1679
1680         prefix, query = query.split(':')
1681         prefix = prefix[8:]
1682         query = query.encode('utf-8')
1683         if prefix == '':
1684             self._download_n_results(query, 1)
1685             return
1686         elif prefix == 'all':
1687             self._download_n_results(query, self._max_yahoo_results)
1688             return
1689         else:
1690             try:
1691                 n = int(prefix)
1692                 if n <= 0:
1693                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1694                     return
1695                 elif n > self._max_yahoo_results:
1696                     self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1697                     n = self._max_yahoo_results
1698                 self._download_n_results(query, n)
1699                 return
1700             except ValueError: # parsing prefix as integer fails
1701                 self._download_n_results(query, 1)
1702                 return
1703
1704     def _download_n_results(self, query, n):
1705         """Downloads a specified number of results for a query"""
1706
1707         video_ids = []
1708         already_seen = set()
1709         pagenum = 1
1710
1711         while True:
1712             self.report_download_page(query, pagenum)
1713             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1714             request = compat_urllib_request.Request(result_url)
1715             try:
1716                 page = compat_urllib_request.urlopen(request).read()
1717             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1718                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1719                 return
1720
1721             # Extract video identifiers
1722             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1723                 video_id = mobj.group(1)
1724                 if video_id not in already_seen:
1725                     video_ids.append(video_id)
1726                     already_seen.add(video_id)
1727                     if len(video_ids) == n:
1728                         # Specified n videos reached
1729                         for id in video_ids:
1730                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1731                         return
1732
1733             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1734                 for id in video_ids:
1735                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1736                 return
1737
1738             pagenum = pagenum + 1
1739
1740
1741 class YoutubePlaylistIE(InfoExtractor):
1742     """Information Extractor for YouTube playlists."""
1743
1744     _VALID_URL = r"""(?:
1745                         (?:https?://)?
1746                         (?:\w+\.)?
1747                         youtube\.com/
1748                         (?:
1749                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1750                            \? (?:.*?&)*? (?:p|a|list)=
1751                         |  p/
1752                         )
1753                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1754                         .*
1755                      |
1756                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1757                      )"""
1758     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1759     _MAX_RESULTS = 50
1760     IE_NAME = u'youtube:playlist'
1761
1762     def __init__(self, downloader=None):
1763         InfoExtractor.__init__(self, downloader)
1764
1765     @classmethod
1766     def suitable(cls, url):
1767         """Receives a URL and returns True if suitable for this IE."""
1768         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1769
1770     def report_download_page(self, playlist_id, pagenum):
1771         """Report attempt to download playlist page with given number."""
1772         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1773
1774     def _real_extract(self, url):
1775         # Extract playlist id
1776         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1777         if mobj is None:
1778             self._downloader.report_error(u'invalid url: %s' % url)
1779             return
1780
1781         # Download playlist videos from API
1782         playlist_id = mobj.group(1) or mobj.group(2)
1783         page_num = 1
1784         videos = []
1785
1786         while True:
1787             self.report_download_page(playlist_id, page_num)
1788
1789             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1790             try:
1791                 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1792             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1793                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1794                 return
1795
1796             try:
1797                 response = json.loads(page)
1798             except ValueError as err:
1799                 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1800                 return
1801
1802             if 'feed' not in response:
1803                 self._downloader.report_error(u'Got a malformed response from YouTube API')
1804                 return
1805             if 'entry' not in response['feed']:
1806                 # Number of videos is a multiple of self._MAX_RESULTS
1807                 break
1808
1809             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1810                         for entry in response['feed']['entry']
1811                         if 'content' in entry ]
1812
1813             if len(response['feed']['entry']) < self._MAX_RESULTS:
1814                 break
1815             page_num += 1
1816
1817         videos = [v[1] for v in sorted(videos)]
1818
1819         url_results = [self.url_result(url) for url in videos]
1820         return [self.playlist_result(url_results, playlist_id)]
1821
1822
1823 class YoutubeChannelIE(InfoExtractor):
1824     """Information Extractor for YouTube channels."""
1825
1826     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1827     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1828     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1829     IE_NAME = u'youtube:channel'
1830
1831     def report_download_page(self, channel_id, pagenum):
1832         """Report attempt to download channel page with given number."""
1833         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1834
1835     def _real_extract(self, url):
1836         # Extract channel id
1837         mobj = re.match(self._VALID_URL, url)
1838         if mobj is None:
1839             self._downloader.report_error(u'invalid url: %s' % url)
1840             return
1841
1842         # Download channel pages
1843         channel_id = mobj.group(1)
1844         video_ids = []
1845         pagenum = 1
1846
1847         while True:
1848             self.report_download_page(channel_id, pagenum)
1849             url = self._TEMPLATE_URL % (channel_id, pagenum)
1850             request = compat_urllib_request.Request(url)
1851             try:
1852                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1853             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1854                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1855                 return
1856
1857             # Extract video identifiers
1858             ids_in_page = []
1859             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1860                 if mobj.group(1) not in ids_in_page:
1861                     ids_in_page.append(mobj.group(1))
1862             video_ids.extend(ids_in_page)
1863
1864             if self._MORE_PAGES_INDICATOR not in page:
1865                 break
1866             pagenum = pagenum + 1
1867
1868         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1869
1870         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1871         url_entries = [self.url_result(url) for url in urls]
1872         return [self.playlist_result(url_entries, channel_id)]
1873
1874
1875 class YoutubeUserIE(InfoExtractor):
1876     """Information Extractor for YouTube users."""
1877
1878     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1879     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1880     _GDATA_PAGE_SIZE = 50
1881     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1882     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1883     IE_NAME = u'youtube:user'
1884
1885     def __init__(self, downloader=None):
1886         InfoExtractor.__init__(self, downloader)
1887
1888     def report_download_page(self, username, start_index):
1889         """Report attempt to download user page."""
1890         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1891                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1892
1893     def _real_extract(self, url):
1894         # Extract username
1895         mobj = re.match(self._VALID_URL, url)
1896         if mobj is None:
1897             self._downloader.report_error(u'invalid url: %s' % url)
1898             return
1899
1900         username = mobj.group(1)
1901
1902         # Download video ids using YouTube Data API. Result size per
1903         # query is limited (currently to 50 videos) so we need to query
1904         # page by page until there are no video ids - it means we got
1905         # all of them.
1906
1907         video_ids = []
1908         pagenum = 0
1909
1910         while True:
1911             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1912             self.report_download_page(username, start_index)
1913
1914             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1915
1916             try:
1917                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1918             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1920                 return
1921
1922             # Extract video identifiers
1923             ids_in_page = []
1924
1925             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1926                 if mobj.group(1) not in ids_in_page:
1927                     ids_in_page.append(mobj.group(1))
1928
1929             video_ids.extend(ids_in_page)
1930
1931             # A little optimization - if current page is not
1932             # "full", ie. does not contain PAGE_SIZE video ids then
1933             # we can assume that this page is the last one - there
1934             # are no more ids on further pages - no need to query
1935             # again.
1936
1937             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1938                 break
1939
1940             pagenum += 1
1941
1942         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1943         url_results = [self.url_result(url) for url in urls]
1944         return [self.playlist_result(url_results, playlist_title = username)]
1945
1946
1947 class BlipTVUserIE(InfoExtractor):
1948     """Information Extractor for blip.tv users."""
1949
1950     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1951     _PAGE_SIZE = 12
1952     IE_NAME = u'blip.tv:user'
1953
1954     def __init__(self, downloader=None):
1955         InfoExtractor.__init__(self, downloader)
1956
1957     def report_download_page(self, username, pagenum):
1958         """Report attempt to download user page."""
1959         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1960                 (self.IE_NAME, username, pagenum))
1961
1962     def _real_extract(self, url):
1963         # Extract username
1964         mobj = re.match(self._VALID_URL, url)
1965         if mobj is None:
1966             self._downloader.report_error(u'invalid url: %s' % url)
1967             return
1968
1969         username = mobj.group(1)
1970
1971         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1972
1973         request = compat_urllib_request.Request(url)
1974
1975         try:
1976             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1977             mobj = re.search(r'data-users-id="([^"]+)"', page)
1978             page_base = page_base % mobj.group(1)
1979         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1980             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1981             return
1982
1983
1984         # Download video ids using BlipTV Ajax calls. Result size per
1985         # query is limited (currently to 12 videos) so we need to query
1986         # page by page until there are no video ids - it means we got
1987         # all of them.
1988
1989         video_ids = []
1990         pagenum = 1
1991
1992         while True:
1993             self.report_download_page(username, pagenum)
1994             url = page_base + "&page=" + str(pagenum)
1995             request = compat_urllib_request.Request( url )
1996             try:
1997                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1998             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1999                 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2000                 return
2001
2002             # Extract video identifiers
2003             ids_in_page = []
2004
2005             for mobj in re.finditer(r'href="/([^"]+)"', page):
2006                 if mobj.group(1) not in ids_in_page:
2007                     ids_in_page.append(unescapeHTML(mobj.group(1)))
2008
2009             video_ids.extend(ids_in_page)
2010
2011             # A little optimization - if current page is not
2012             # "full", ie. does not contain PAGE_SIZE video ids then
2013             # we can assume that this page is the last one - there
2014             # are no more ids on further pages - no need to query
2015             # again.
2016
2017             if len(ids_in_page) < self._PAGE_SIZE:
2018                 break
2019
2020             pagenum += 1
2021
2022         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2023                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2024
2025         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2026         url_entries = [self.url_result(url) for url in urls]
2027         return [self.playlist_result(url_entries, playlist_title = username)]
2028
2029
2030 class DepositFilesIE(InfoExtractor):
2031     """Information extractor for depositfiles.com"""
2032
2033     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2034
2035     def report_download_webpage(self, file_id):
2036         """Report webpage download."""
2037         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2038
2039     def report_extraction(self, file_id):
2040         """Report information extraction."""
2041         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2042
2043     def _real_extract(self, url):
2044         file_id = url.split('/')[-1]
2045         # Rebuild url in english locale
2046         url = 'http://depositfiles.com/en/files/' + file_id
2047
2048         # Retrieve file webpage with 'Free download' button pressed
2049         free_download_indication = { 'gateway_result' : '1' }
2050         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2051         try:
2052             self.report_download_webpage(file_id)
2053             webpage = compat_urllib_request.urlopen(request).read()
2054         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2055             self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2056             return
2057
2058         # Search for the real file URL
2059         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2060         if (mobj is None) or (mobj.group(1) is None):
2061             # Try to figure out reason of the error.
2062             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2063             if (mobj is not None) and (mobj.group(1) is not None):
2064                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2065                 self._downloader.report_error(u'%s' % restriction_message)
2066             else:
2067                 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2068             return
2069
2070         file_url = mobj.group(1)
2071         file_extension = os.path.splitext(file_url)[1][1:]
2072
2073         # Search for file title
2074         mobj = re.search(r'<b title="(.*?)">', webpage)
2075         if mobj is None:
2076             self._downloader.report_error(u'unable to extract title')
2077             return
2078         file_title = mobj.group(1).decode('utf-8')
2079
2080         return [{
2081             'id':       file_id.decode('utf-8'),
2082             'url':      file_url.decode('utf-8'),
2083             'uploader': None,
2084             'upload_date':  None,
2085             'title':    file_title,
2086             'ext':      file_extension.decode('utf-8'),
2087         }]
2088
2089
2090 class FacebookIE(InfoExtractor):
2091     """Information Extractor for Facebook"""
2092
2093     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2094     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2095     _NETRC_MACHINE = 'facebook'
2096     IE_NAME = u'facebook'
2097
2098     def report_login(self):
2099         """Report attempt to log in."""
2100         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2101
2102     def _real_initialize(self):
2103         if self._downloader is None:
2104             return
2105
2106         useremail = None
2107         password = None
2108         downloader_params = self._downloader.params
2109
2110         # Attempt to use provided username and password or .netrc data
2111         if downloader_params.get('username', None) is not None:
2112             useremail = downloader_params['username']
2113             password = downloader_params['password']
2114         elif downloader_params.get('usenetrc', False):
2115             try:
2116                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2117                 if info is not None:
2118                     useremail = info[0]
2119                     password = info[2]
2120                 else:
2121                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2122             except (IOError, netrc.NetrcParseError) as err:
2123                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2124                 return
2125
2126         if useremail is None:
2127             return
2128
2129         # Log in
2130         login_form = {
2131             'email': useremail,
2132             'pass': password,
2133             'login': 'Log+In'
2134             }
2135         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2136         try:
2137             self.report_login()
2138             login_results = compat_urllib_request.urlopen(request).read()
2139             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2140                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2141                 return
2142         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2143             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2144             return
2145
2146     def _real_extract(self, url):
2147         mobj = re.match(self._VALID_URL, url)
2148         if mobj is None:
2149             self._downloader.report_error(u'invalid URL: %s' % url)
2150             return
2151         video_id = mobj.group('ID')
2152
2153         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2154         webpage = self._download_webpage(url, video_id)
2155
2156         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2157         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2158         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2159         if not m:
2160             raise ExtractorError(u'Cannot parse data')
2161         data = dict(json.loads(m.group(1)))
2162         params_raw = compat_urllib_parse.unquote(data['params'])
2163         params = json.loads(params_raw)
2164         video_data = params['video_data'][0]
2165         video_url = video_data.get('hd_src')
2166         if not video_url:
2167             video_url = video_data['sd_src']
2168         if not video_url:
2169             raise ExtractorError(u'Cannot find video URL')
2170         video_duration = int(video_data['video_duration'])
2171         thumbnail = video_data['thumbnail_src']
2172
2173         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2174         if not m:
2175             raise ExtractorError(u'Cannot find title in webpage')
2176         video_title = unescapeHTML(m.group(1))
2177
2178         info = {
2179             'id': video_id,
2180             'title': video_title,
2181             'url': video_url,
2182             'ext': 'mp4',
2183             'duration': video_duration,
2184             'thumbnail': thumbnail,
2185         }
2186         return [info]
2187
2188
2189 class BlipTVIE(InfoExtractor):
2190     """Information extractor for blip.tv"""
2191
2192     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2193     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2194     IE_NAME = u'blip.tv'
2195
2196     def report_extraction(self, file_id):
2197         """Report information extraction."""
2198         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2199
2200     def report_direct_download(self, title):
2201         """Report information extraction."""
2202         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2203
2204     def _real_extract(self, url):
2205         mobj = re.match(self._VALID_URL, url)
2206         if mobj is None:
2207             self._downloader.report_error(u'invalid URL: %s' % url)
2208             return
2209
2210         urlp = compat_urllib_parse_urlparse(url)
2211         if urlp.path.startswith('/play/'):
2212             request = compat_urllib_request.Request(url)
2213             response = compat_urllib_request.urlopen(request)
2214             redirecturl = response.geturl()
2215             rurlp = compat_urllib_parse_urlparse(redirecturl)
2216             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2217             url = 'http://blip.tv/a/a-' + file_id
2218             return self._real_extract(url)
2219
2220
2221         if '?' in url:
2222             cchar = '&'
2223         else:
2224             cchar = '?'
2225         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2226         request = compat_urllib_request.Request(json_url)
2227         request.add_header('User-Agent', 'iTunes/10.6.1')
2228         self.report_extraction(mobj.group(1))
2229         info = None
2230         try:
2231             urlh = compat_urllib_request.urlopen(request)
2232             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2233                 basename = url.split('/')[-1]
2234                 title,ext = os.path.splitext(basename)
2235                 title = title.decode('UTF-8')
2236                 ext = ext.replace('.', '')
2237                 self.report_direct_download(title)
2238                 info = {
2239                     'id': title,
2240                     'url': url,
2241                     'uploader': None,
2242                     'upload_date': None,
2243                     'title': title,
2244                     'ext': ext,
2245                     'urlhandle': urlh
2246                 }
2247         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2248             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2249         if info is None: # Regular URL
2250             try:
2251                 json_code_bytes = urlh.read()
2252                 json_code = json_code_bytes.decode('utf-8')
2253             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2254                 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2255                 return
2256
2257             try:
2258                 json_data = json.loads(json_code)
2259                 if 'Post' in json_data:
2260                     data = json_data['Post']
2261                 else:
2262                     data = json_data
2263
2264                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2265                 video_url = data['media']['url']
2266                 umobj = re.match(self._URL_EXT, video_url)
2267                 if umobj is None:
2268                     raise ValueError('Can not determine filename extension')
2269                 ext = umobj.group(1)
2270
2271                 info = {
2272                     'id': data['item_id'],
2273                     'url': video_url,
2274                     'uploader': data['display_name'],
2275                     'upload_date': upload_date,
2276                     'title': data['title'],
2277                     'ext': ext,
2278                     'format': data['media']['mimeType'],
2279                     'thumbnail': data['thumbnailUrl'],
2280                     'description': data['description'],
2281                     'player_url': data['embedUrl'],
2282                     'user_agent': 'iTunes/10.6.1',
2283                 }
2284             except (ValueError,KeyError) as err:
2285                 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2286                 return
2287
2288         return [info]
2289
2290
2291 class MyVideoIE(InfoExtractor):
2292     """Information Extractor for myvideo.de."""
2293
2294     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2295     IE_NAME = u'myvideo'
2296
2297     def __init__(self, downloader=None):
2298         InfoExtractor.__init__(self, downloader)
2299
2300     def report_extraction(self, video_id):
2301         """Report information extraction."""
2302         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2303
2304     def _real_extract(self,url):
2305         mobj = re.match(self._VALID_URL, url)
2306         if mobj is None:
2307             self._download.report_error(u'invalid URL: %s' % url)
2308             return
2309
2310         video_id = mobj.group(1)
2311
2312         # Get video webpage
2313         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2314         webpage = self._download_webpage(webpage_url, video_id)
2315
2316         self.report_extraction(video_id)
2317         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2318                  webpage)
2319         if mobj is None:
2320             self._downloader.report_error(u'unable to extract media URL')
2321             return
2322         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2323
2324         mobj = re.search('<title>([^<]+)</title>', webpage)
2325         if mobj is None:
2326             self._downloader.report_error(u'unable to extract title')
2327             return
2328
2329         video_title = mobj.group(1)
2330
2331         return [{
2332             'id':       video_id,
2333             'url':      video_url,
2334             'uploader': None,
2335             'upload_date':  None,
2336             'title':    video_title,
2337             'ext':      u'flv',
2338         }]
2339
2340 class ComedyCentralIE(InfoExtractor):
2341     """Information extractor for The Daily Show and Colbert Report """
2342
2343     # urls can be abbreviations like :thedailyshow or :colbert
2344     # urls for episodes like:
2345     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2346     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2347     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2348     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2349                       |(https?://)?(www\.)?
2350                           (?P<showname>thedailyshow|colbertnation)\.com/
2351                          (full-episodes/(?P<episode>.*)|
2352                           (?P<clip>
2353                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2354                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2355                      $"""
2356
2357     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2358
2359     _video_extensions = {
2360         '3500': 'mp4',
2361         '2200': 'mp4',
2362         '1700': 'mp4',
2363         '1200': 'mp4',
2364         '750': 'mp4',
2365         '400': 'mp4',
2366     }
2367     _video_dimensions = {
2368         '3500': '1280x720',
2369         '2200': '960x540',
2370         '1700': '768x432',
2371         '1200': '640x360',
2372         '750': '512x288',
2373         '400': '384x216',
2374     }
2375
2376     @classmethod
2377     def suitable(cls, url):
2378         """Receives a URL and returns True if suitable for this IE."""
2379         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2380
2381     def report_extraction(self, episode_id):
2382         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2383
2384     def report_config_download(self, episode_id, media_id):
2385         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2386
2387     def report_index_download(self, episode_id):
2388         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2389
2390     def _print_formats(self, formats):
2391         print('Available formats:')
2392         for x in formats:
2393             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2394
2395
2396     def _real_extract(self, url):
2397         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2398         if mobj is None:
2399             self._downloader.report_error(u'invalid URL: %s' % url)
2400             return
2401
2402         if mobj.group('shortname'):
2403             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2404                 url = u'http://www.thedailyshow.com/full-episodes/'
2405             else:
2406                 url = u'http://www.colbertnation.com/full-episodes/'
2407             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2408             assert mobj is not None
2409
2410         if mobj.group('clip'):
2411             if mobj.group('showname') == 'thedailyshow':
2412                 epTitle = mobj.group('tdstitle')
2413             else:
2414                 epTitle = mobj.group('cntitle')
2415             dlNewest = False
2416         else:
2417             dlNewest = not mobj.group('episode')
2418             if dlNewest:
2419                 epTitle = mobj.group('showname')
2420             else:
2421                 epTitle = mobj.group('episode')
2422
2423         req = compat_urllib_request.Request(url)
2424         self.report_extraction(epTitle)
2425         try:
2426             htmlHandle = compat_urllib_request.urlopen(req)
2427             html = htmlHandle.read()
2428             webpage = html.decode('utf-8')
2429         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2430             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2431             return
2432         if dlNewest:
2433             url = htmlHandle.geturl()
2434             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2435             if mobj is None:
2436                 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2437                 return
2438             if mobj.group('episode') == '':
2439                 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2440                 return
2441             epTitle = mobj.group('episode')
2442
2443         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2444
2445         if len(mMovieParams) == 0:
2446             # The Colbert Report embeds the information in a without
2447             # a URL prefix; so extract the alternate reference
2448             # and then add the URL prefix manually.
2449
2450             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2451             if len(altMovieParams) == 0:
2452                 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2453                 return
2454             else:
2455                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2456
2457         uri = mMovieParams[0][1]
2458         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2459         self.report_index_download(epTitle)
2460         try:
2461             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2462         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2463             self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2464             return
2465
2466         results = []
2467
2468         idoc = xml.etree.ElementTree.fromstring(indexXml)
2469         itemEls = idoc.findall('.//item')
2470         for partNum,itemEl in enumerate(itemEls):
2471             mediaId = itemEl.findall('./guid')[0].text
2472             shortMediaId = mediaId.split(':')[-1]
2473             showId = mediaId.split(':')[-2].replace('.com', '')
2474             officialTitle = itemEl.findall('./title')[0].text
2475             officialDate = itemEl.findall('./pubDate')[0].text
2476
2477             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2478                         compat_urllib_parse.urlencode({'uri': mediaId}))
2479             configReq = compat_urllib_request.Request(configUrl)
2480             self.report_config_download(epTitle, shortMediaId)
2481             try:
2482                 configXml = compat_urllib_request.urlopen(configReq).read()
2483             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2484                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2485                 return
2486
2487             cdoc = xml.etree.ElementTree.fromstring(configXml)
2488             turls = []
2489             for rendition in cdoc.findall('.//rendition'):
2490                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2491                 turls.append(finfo)
2492
2493             if len(turls) == 0:
2494                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2495                 continue
2496
2497             if self._downloader.params.get('listformats', None):
2498                 self._print_formats([i[0] for i in turls])
2499                 return
2500
2501             # For now, just pick the highest bitrate
2502             format,rtmp_video_url = turls[-1]
2503
2504             # Get the format arg from the arg stream
2505             req_format = self._downloader.params.get('format', None)
2506
2507             # Select format if we can find one
2508             for f,v in turls:
2509                 if f == req_format:
2510                     format, rtmp_video_url = f, v
2511                     break
2512
2513             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2514             if not m:
2515                 raise ExtractorError(u'Cannot transform RTMP url')
2516             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2517             video_url = base + m.group('finalid')
2518
2519             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2520             info = {
2521                 'id': shortMediaId,
2522                 'url': video_url,
2523                 'uploader': showId,
2524                 'upload_date': officialDate,
2525                 'title': effTitle,
2526                 'ext': 'mp4',
2527                 'format': format,
2528                 'thumbnail': None,
2529                 'description': officialTitle,
2530             }
2531             results.append(info)
2532
2533         return results
2534
2535
2536 class EscapistIE(InfoExtractor):
2537     """Information extractor for The Escapist """
2538
2539     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2540     IE_NAME = u'escapist'
2541
2542     def report_extraction(self, showName):
2543         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2544
2545     def report_config_download(self, showName):
2546         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2547
2548     def _real_extract(self, url):
2549         mobj = re.match(self._VALID_URL, url)
2550         if mobj is None:
2551             self._downloader.report_error(u'invalid URL: %s' % url)
2552             return
2553         showName = mobj.group('showname')
2554         videoId = mobj.group('episode')
2555
2556         self.report_extraction(showName)
2557         try:
2558             webPage = compat_urllib_request.urlopen(url)
2559             webPageBytes = webPage.read()
2560             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2561             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2562         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2563             self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
2564             return
2565
2566         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2567         description = unescapeHTML(descMatch.group(1))
2568         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2569         imgUrl = unescapeHTML(imgMatch.group(1))
2570         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2571         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2572         configUrlMatch = re.search('config=(.*)$', playerUrl)
2573         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2574
2575         self.report_config_download(showName)
2576         try:
2577             configJSON = compat_urllib_request.urlopen(configUrl)
2578             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2579             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2580         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2581             self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2582             return
2583
2584         # Technically, it's JavaScript, not JSON
2585         configJSON = configJSON.replace("'", '"')
2586
2587         try:
2588             config = json.loads(configJSON)
2589         except (ValueError,) as err:
2590             self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2591             return
2592
2593         playlist = config['playlist']
2594         videoUrl = playlist[1]['url']
2595
2596         info = {
2597             'id': videoId,
2598             'url': videoUrl,
2599             'uploader': showName,
2600             'upload_date': None,
2601             'title': showName,
2602             'ext': 'mp4',
2603             'thumbnail': imgUrl,
2604             'description': description,
2605             'player_url': playerUrl,
2606         }
2607
2608         return [info]
2609
2610 class CollegeHumorIE(InfoExtractor):
2611     """Information extractor for collegehumor.com"""
2612
2613     _WORKING = False
2614     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2615     IE_NAME = u'collegehumor'
2616
2617     def report_manifest(self, video_id):
2618         """Report information extraction."""
2619         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2620
2621     def report_extraction(self, video_id):
2622         """Report information extraction."""
2623         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2624
2625     def _real_extract(self, url):
2626         mobj = re.match(self._VALID_URL, url)
2627         if mobj is None:
2628             self._downloader.report_error(u'invalid URL: %s' % url)
2629             return
2630         video_id = mobj.group('videoid')
2631
2632         info = {
2633             'id': video_id,
2634             'uploader': None,
2635             'upload_date': None,
2636         }
2637
2638         self.report_extraction(video_id)
2639         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2640         try:
2641             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2642         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2643             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2644             return
2645
2646         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2647         try:
2648             videoNode = mdoc.findall('./video')[0]
2649             info['description'] = videoNode.findall('./description')[0].text
2650             info['title'] = videoNode.findall('./caption')[0].text
2651             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2652             manifest_url = videoNode.findall('./file')[0].text
2653         except IndexError:
2654             self._downloader.report_error(u'Invalid metadata XML file')
2655             return
2656
2657         manifest_url += '?hdcore=2.10.3'
2658         self.report_manifest(video_id)
2659         try:
2660             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2661         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2662             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2663             return
2664
2665         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2666         try:
2667             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2668             node_id = media_node.attrib['url']
2669             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2670         except IndexError as err:
2671             self._downloader.report_error(u'Invalid manifest file')
2672             return
2673
2674         url_pr = compat_urllib_parse_urlparse(manifest_url)
2675         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2676
2677         info['url'] = url
2678         info['ext'] = 'f4f'
2679         return [info]
2680
2681
2682 class XVideosIE(InfoExtractor):
2683     """Information extractor for xvideos.com"""
2684
2685     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2686     IE_NAME = u'xvideos'
2687
2688     def report_extraction(self, video_id):
2689         """Report information extraction."""
2690         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2691
2692     def _real_extract(self, url):
2693         mobj = re.match(self._VALID_URL, url)
2694         if mobj is None:
2695             self._downloader.report_error(u'invalid URL: %s' % url)
2696             return
2697         video_id = mobj.group(1)
2698
2699         webpage = self._download_webpage(url, video_id)
2700
2701         self.report_extraction(video_id)
2702
2703
2704         # Extract video URL
2705         mobj = re.search(r'flv_url=(.+?)&', webpage)
2706         if mobj is None:
2707             self._downloader.report_error(u'unable to extract video url')
2708             return
2709         video_url = compat_urllib_parse.unquote(mobj.group(1))
2710
2711
2712         # Extract title
2713         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2714         if mobj is None:
2715             self._downloader.report_error(u'unable to extract video title')
2716             return
2717         video_title = mobj.group(1)
2718
2719
2720         # Extract video thumbnail
2721         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2722         if mobj is None:
2723             self._downloader.report_error(u'unable to extract video thumbnail')
2724             return
2725         video_thumbnail = mobj.group(0)
2726
2727         info = {
2728             'id': video_id,
2729             'url': video_url,
2730             'uploader': None,
2731             'upload_date': None,
2732             'title': video_title,
2733             'ext': 'flv',
2734             'thumbnail': video_thumbnail,
2735             'description': None,
2736         }
2737
2738         return [info]
2739
2740
2741 class SoundcloudIE(InfoExtractor):
2742     """Information extractor for soundcloud.com
2743        To access the media, the uid of the song and a stream token
2744        must be extracted from the page source and the script must make
2745        a request to media.soundcloud.com/crossdomain.xml. Then
2746        the media can be grabbed by requesting from an url composed
2747        of the stream token and uid
2748      """
2749
2750     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2751     IE_NAME = u'soundcloud'
2752
2753     def __init__(self, downloader=None):
2754         InfoExtractor.__init__(self, downloader)
2755
2756     def report_resolve(self, video_id):
2757         """Report information extraction."""
2758         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2759
2760     def report_extraction(self, video_id):
2761         """Report information extraction."""
2762         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2763
2764     def _real_extract(self, url):
2765         mobj = re.match(self._VALID_URL, url)
2766         if mobj is None:
2767             self._downloader.report_error(u'invalid URL: %s' % url)
2768             return
2769
2770         # extract uploader (which is in the url)
2771         uploader = mobj.group(1)
2772         # extract simple title (uploader + slug of song title)
2773         slug_title =  mobj.group(2)
2774         simple_title = uploader + u'-' + slug_title
2775
2776         self.report_resolve('%s/%s' % (uploader, slug_title))
2777
2778         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2779         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2780         request = compat_urllib_request.Request(resolv_url)
2781         try:
2782             info_json_bytes = compat_urllib_request.urlopen(request).read()
2783             info_json = info_json_bytes.decode('utf-8')
2784         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2785             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2786             return
2787
2788         info = json.loads(info_json)
2789         video_id = info['id']
2790         self.report_extraction('%s/%s' % (uploader, slug_title))
2791
2792         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2793         request = compat_urllib_request.Request(streams_url)
2794         try:
2795             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2796             stream_json = stream_json_bytes.decode('utf-8')
2797         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2798             self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2799             return
2800
2801         streams = json.loads(stream_json)
2802         mediaURL = streams['http_mp3_128_url']
2803
2804         return [{
2805             'id':       info['id'],
2806             'url':      mediaURL,
2807             'uploader': info['user']['username'],
2808             'upload_date':  info['created_at'],
2809             'title':    info['title'],
2810             'ext':      u'mp3',
2811             'description': info['description'],
2812         }]
2813
2814 class SoundcloudSetIE(InfoExtractor):
2815     """Information extractor for soundcloud.com sets
2816        To access the media, the uid of the song and a stream token
2817        must be extracted from the page source and the script must make
2818        a request to media.soundcloud.com/crossdomain.xml. Then
2819        the media can be grabbed by requesting from an url composed
2820        of the stream token and uid
2821      """
2822
2823     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2824     IE_NAME = u'soundcloud'
2825
2826     def __init__(self, downloader=None):
2827         InfoExtractor.__init__(self, downloader)
2828
2829     def report_resolve(self, video_id):
2830         """Report information extraction."""
2831         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2832
2833     def report_extraction(self, video_id):
2834         """Report information extraction."""
2835         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2836
2837     def _real_extract(self, url):
2838         mobj = re.match(self._VALID_URL, url)
2839         if mobj is None:
2840             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2841             return
2842
2843         # extract uploader (which is in the url)
2844         uploader = mobj.group(1)
2845         # extract simple title (uploader + slug of song title)
2846         slug_title =  mobj.group(2)
2847         simple_title = uploader + u'-' + slug_title
2848
2849         self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2850
2851         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2852         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2853         request = compat_urllib_request.Request(resolv_url)
2854         try:
2855             info_json_bytes = compat_urllib_request.urlopen(request).read()
2856             info_json = info_json_bytes.decode('utf-8')
2857         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2858             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2859             return
2860
2861         videos = []
2862         info = json.loads(info_json)
2863         if 'errors' in info:
2864             for err in info['errors']:
2865                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2866             return
2867
2868         for track in info['tracks']:
2869             video_id = track['id']
2870             self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2871
2872             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2873             request = compat_urllib_request.Request(streams_url)
2874             try:
2875                 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2876                 stream_json = stream_json_bytes.decode('utf-8')
2877             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2878                 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2879                 return
2880
2881             streams = json.loads(stream_json)
2882             mediaURL = streams['http_mp3_128_url']
2883
2884             videos.append({
2885                 'id':       video_id,
2886                 'url':      mediaURL,
2887                 'uploader': track['user']['username'],
2888                 'upload_date':  track['created_at'],
2889                 'title':    track['title'],
2890                 'ext':      u'mp3',
2891                 'description': track['description'],
2892             })
2893         return videos
2894
2895
2896 class InfoQIE(InfoExtractor):
2897     """Information extractor for infoq.com"""
2898     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2899
2900     def report_extraction(self, video_id):
2901         """Report information extraction."""
2902         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2903
2904     def _real_extract(self, url):
2905         mobj = re.match(self._VALID_URL, url)
2906         if mobj is None:
2907             self._downloader.report_error(u'invalid URL: %s' % url)
2908             return
2909
2910         webpage = self._download_webpage(url, video_id=url)
2911         self.report_extraction(url)
2912
2913         # Extract video URL
2914         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2915         if mobj is None:
2916             self._downloader.report_error(u'unable to extract video url')
2917             return
2918         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2919         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2920
2921         # Extract title
2922         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2923         if mobj is None:
2924             self._downloader.report_error(u'unable to extract video title')
2925             return
2926         video_title = mobj.group(1)
2927
2928         # Extract description
2929         video_description = u'No description available.'
2930         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2931         if mobj is not None:
2932             video_description = mobj.group(1)
2933
2934         video_filename = video_url.split('/')[-1]
2935         video_id, extension = video_filename.split('.')
2936
2937         info = {
2938             'id': video_id,
2939             'url': video_url,
2940             'uploader': None,
2941             'upload_date': None,
2942             'title': video_title,
2943             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2944             'thumbnail': None,
2945             'description': video_description,
2946         }
2947
2948         return [info]
2949
2950 class MixcloudIE(InfoExtractor):
2951     """Information extractor for www.mixcloud.com"""
2952
2953     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2954     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2955     IE_NAME = u'mixcloud'
2956
2957     def __init__(self, downloader=None):
2958         InfoExtractor.__init__(self, downloader)
2959
2960     def report_download_json(self, file_id):
2961         """Report JSON download."""
2962         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2963
2964     def report_extraction(self, file_id):
2965         """Report information extraction."""
2966         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2967
2968     def get_urls(self, jsonData, fmt, bitrate='best'):
2969         """Get urls from 'audio_formats' section in json"""
2970         file_url = None
2971         try:
2972             bitrate_list = jsonData[fmt]
2973             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2974                 bitrate = max(bitrate_list) # select highest
2975
2976             url_list = jsonData[fmt][bitrate]
2977         except TypeError: # we have no bitrate info.
2978             url_list = jsonData[fmt]
2979         return url_list
2980
2981     def check_urls(self, url_list):
2982         """Returns 1st active url from list"""
2983         for url in url_list:
2984             try:
2985                 compat_urllib_request.urlopen(url)
2986                 return url
2987             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2988                 url = None
2989
2990         return None
2991
2992     def _print_formats(self, formats):
2993         print('Available formats:')
2994         for fmt in formats.keys():
2995             for b in formats[fmt]:
2996                 try:
2997                     ext = formats[fmt][b][0]
2998                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2999                 except TypeError: # we have no bitrate info
3000                     ext = formats[fmt][0]
3001                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3002                     break
3003
3004     def _real_extract(self, url):
3005         mobj = re.match(self._VALID_URL, url)
3006         if mobj is None:
3007             self._downloader.report_error(u'invalid URL: %s' % url)
3008             return
3009         # extract uploader & filename from url
3010         uploader = mobj.group(1).decode('utf-8')
3011         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3012
3013         # construct API request
3014         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3015         # retrieve .json file with links to files
3016         request = compat_urllib_request.Request(file_url)
3017         try:
3018             self.report_download_json(file_url)
3019             jsonData = compat_urllib_request.urlopen(request).read()
3020         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3021             self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3022             return
3023
3024         # parse JSON
3025         json_data = json.loads(jsonData)
3026         player_url = json_data['player_swf_url']
3027         formats = dict(json_data['audio_formats'])
3028
3029         req_format = self._downloader.params.get('format', None)
3030         bitrate = None
3031
3032         if self._downloader.params.get('listformats', None):
3033             self._print_formats(formats)
3034             return
3035
3036         if req_format is None or req_format == 'best':
3037             for format_param in formats.keys():
3038                 url_list = self.get_urls(formats, format_param)
3039                 # check urls
3040                 file_url = self.check_urls(url_list)
3041                 if file_url is not None:
3042                     break # got it!
3043         else:
3044             if req_format not in formats:
3045                 self._downloader.report_error(u'format is not available')
3046                 return
3047
3048             url_list = self.get_urls(formats, req_format)
3049             file_url = self.check_urls(url_list)
3050             format_param = req_format
3051
3052         return [{
3053             'id': file_id.decode('utf-8'),
3054             'url': file_url.decode('utf-8'),
3055             'uploader': uploader.decode('utf-8'),
3056             'upload_date': None,
3057             'title': json_data['name'],
3058             'ext': file_url.split('.')[-1].decode('utf-8'),
3059             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3060             'thumbnail': json_data['thumbnail_url'],
3061             'description': json_data['description'],
3062             'player_url': player_url.decode('utf-8'),
3063         }]
3064
3065 class StanfordOpenClassroomIE(InfoExtractor):
3066     """Information extractor for Stanford's Open ClassRoom"""
3067
3068     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3069     IE_NAME = u'stanfordoc'
3070
3071     def report_download_webpage(self, objid):
3072         """Report information extraction."""
3073         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3074
3075     def report_extraction(self, video_id):
3076         """Report information extraction."""
3077         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3078
3079     def _real_extract(self, url):
3080         mobj = re.match(self._VALID_URL, url)
3081         if mobj is None:
3082             raise ExtractorError(u'Invalid URL: %s' % url)
3083
3084         if mobj.group('course') and mobj.group('video'): # A specific video
3085             course = mobj.group('course')
3086             video = mobj.group('video')
3087             info = {
3088                 'id': course + '_' + video,
3089                 'uploader': None,
3090                 'upload_date': None,
3091             }
3092
3093             self.report_extraction(info['id'])
3094             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3095             xmlUrl = baseUrl + video + '.xml'
3096             try:
3097                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3098             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3099                 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3100                 return
3101             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3102             try:
3103                 info['title'] = mdoc.findall('./title')[0].text
3104                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3105             except IndexError:
3106                 self._downloader.report_error(u'Invalid metadata XML file')
3107                 return
3108             info['ext'] = info['url'].rpartition('.')[2]
3109             return [info]
3110         elif mobj.group('course'): # A course page
3111             course = mobj.group('course')
3112             info = {
3113                 'id': course,
3114                 'type': 'playlist',
3115                 'uploader': None,
3116                 'upload_date': None,
3117             }
3118
3119             coursepage = self._download_webpage(url, info['id'],
3120                                         note='Downloading course info page',
3121                                         errnote='Unable to download course info page')
3122
3123             m = re.search('<h1>([^<]+)</h1>', coursepage)
3124             if m:
3125                 info['title'] = unescapeHTML(m.group(1))
3126             else:
3127                 info['title'] = info['id']
3128
3129             m = re.search('<description>([^<]+)</description>', coursepage)
3130             if m:
3131                 info['description'] = unescapeHTML(m.group(1))
3132
3133             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3134             info['list'] = [
3135                 {
3136                     'type': 'reference',
3137                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3138                 }
3139                     for vpage in links]
3140             results = []
3141             for entry in info['list']:
3142                 assert entry['type'] == 'reference'
3143                 results += self.extract(entry['url'])
3144             return results
3145         else: # Root page
3146             info = {
3147                 'id': 'Stanford OpenClassroom',
3148                 'type': 'playlist',
3149                 'uploader': None,
3150                 'upload_date': None,
3151             }
3152
3153             self.report_download_webpage(info['id'])
3154             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3155             try:
3156                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3157             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3158                 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3159                 return
3160
3161             info['title'] = info['id']
3162
3163             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3164             info['list'] = [
3165                 {
3166                     'type': 'reference',
3167                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3168                 }
3169                     for cpage in links]
3170
3171             results = []
3172             for entry in info['list']:
3173                 assert entry['type'] == 'reference'
3174                 results += self.extract(entry['url'])
3175             return results
3176
3177 class MTVIE(InfoExtractor):
3178     """Information extractor for MTV.com"""
3179
3180     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3181     IE_NAME = u'mtv'
3182
3183     def report_extraction(self, video_id):
3184         """Report information extraction."""
3185         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3186
3187     def _real_extract(self, url):
3188         mobj = re.match(self._VALID_URL, url)
3189         if mobj is None:
3190             self._downloader.report_error(u'invalid URL: %s' % url)
3191             return
3192         if not mobj.group('proto'):
3193             url = 'http://' + url
3194         video_id = mobj.group('videoid')
3195
3196         webpage = self._download_webpage(url, video_id)
3197
3198         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3199         if mobj is None:
3200             self._downloader.report_error(u'unable to extract song name')
3201             return
3202         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3203         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3204         if mobj is None:
3205             self._downloader.report_error(u'unable to extract performer')
3206             return
3207         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3208         video_title = performer + ' - ' + song_name
3209
3210         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3211         if mobj is None:
3212             self._downloader.report_error(u'unable to mtvn_uri')
3213             return
3214         mtvn_uri = mobj.group(1)
3215
3216         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3217         if mobj is None:
3218             self._downloader.report_error(u'unable to extract content id')
3219             return
3220         content_id = mobj.group(1)
3221
3222         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3223         self.report_extraction(video_id)
3224         request = compat_urllib_request.Request(videogen_url)
3225         try:
3226             metadataXml = compat_urllib_request.urlopen(request).read()
3227         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3228             self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3229             return
3230
3231         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3232         renditions = mdoc.findall('.//rendition')
3233
3234         # For now, always pick the highest quality.
3235         rendition = renditions[-1]
3236
3237         try:
3238             _,_,ext = rendition.attrib['type'].partition('/')
3239             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3240             video_url = rendition.find('./src').text
3241         except KeyError:
3242             self._downloader.trouble('Invalid rendition field.')
3243             return
3244
3245         info = {
3246             'id': video_id,
3247             'url': video_url,
3248             'uploader': performer,
3249             'upload_date': None,
3250             'title': video_title,
3251             'ext': ext,
3252             'format': format,
3253         }
3254
3255         return [info]
3256
3257
3258 class YoukuIE(InfoExtractor):
3259     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3260
3261     def report_download_webpage(self, file_id):
3262         """Report webpage download."""
3263         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3264
3265     def report_extraction(self, file_id):
3266         """Report information extraction."""
3267         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3268
3269     def _gen_sid(self):
3270         nowTime = int(time.time() * 1000)
3271         random1 = random.randint(1000,1998)
3272         random2 = random.randint(1000,9999)
3273
3274         return "%d%d%d" %(nowTime,random1,random2)
3275
3276     def _get_file_ID_mix_string(self, seed):
3277         mixed = []
3278         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3279         seed = float(seed)
3280         for i in range(len(source)):
3281             seed  =  (seed * 211 + 30031 ) % 65536
3282             index  =  math.floor(seed / 65536 * len(source) )
3283             mixed.append(source[int(index)])
3284             source.remove(source[int(index)])
3285         #return ''.join(mixed)
3286         return mixed
3287
3288     def _get_file_id(self, fileId, seed):
3289         mixed = self._get_file_ID_mix_string(seed)
3290         ids = fileId.split('*')
3291         realId = []
3292         for ch in ids:
3293             if ch:
3294                 realId.append(mixed[int(ch)])
3295         return ''.join(realId)
3296
3297     def _real_extract(self, url):
3298         mobj = re.match(self._VALID_URL, url)
3299         if mobj is None:
3300             self._downloader.report_error(u'invalid URL: %s' % url)
3301             return
3302         video_id = mobj.group('ID')
3303
3304         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3305
3306         request = compat_urllib_request.Request(info_url, None, std_headers)
3307         try:
3308             self.report_download_webpage(video_id)
3309             jsondata = compat_urllib_request.urlopen(request).read()
3310         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3311             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3312             return
3313
3314         self.report_extraction(video_id)
3315         try:
3316             jsonstr = jsondata.decode('utf-8')
3317             config = json.loads(jsonstr)
3318
3319             video_title =  config['data'][0]['title']
3320             seed = config['data'][0]['seed']
3321
3322             format = self._downloader.params.get('format', None)
3323             supported_format = list(config['data'][0]['streamfileids'].keys())
3324
3325             if format is None or format == 'best':
3326                 if 'hd2' in supported_format:
3327                     format = 'hd2'
3328                 else:
3329                     format = 'flv'
3330                 ext = u'flv'
3331             elif format == 'worst':
3332                 format = 'mp4'
3333                 ext = u'mp4'
3334             else:
3335                 format = 'flv'
3336                 ext = u'flv'
3337
3338
3339             fileid = config['data'][0]['streamfileids'][format]
3340             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3341         except (UnicodeDecodeError, ValueError, KeyError):
3342             self._downloader.report_error(u'unable to extract info section')
3343             return
3344
3345         files_info=[]
3346         sid = self._gen_sid()
3347         fileid = self._get_file_id(fileid, seed)
3348
3349         #column 8,9 of fileid represent the segment number
3350         #fileid[7:9] should be changed
3351         for index, key in enumerate(keys):
3352
3353             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3354             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3355
3356             info = {
3357                 'id': '%s_part%02d' % (video_id, index),
3358                 'url': download_url,
3359                 'uploader': None,
3360                 'upload_date': None,
3361                 'title': video_title,
3362                 'ext': ext,
3363             }
3364             files_info.append(info)
3365
3366         return files_info
3367
3368
3369 class XNXXIE(InfoExtractor):
3370     """Information extractor for xnxx.com"""
3371
3372     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3373     IE_NAME = u'xnxx'
3374     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3375     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3376     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3377
3378     def report_webpage(self, video_id):
3379         """Report information extraction"""
3380         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3381
3382     def report_extraction(self, video_id):
3383         """Report information extraction"""
3384         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3385
3386     def _real_extract(self, url):
3387         mobj = re.match(self._VALID_URL, url)
3388         if mobj is None:
3389             self._downloader.report_error(u'invalid URL: %s' % url)
3390             return
3391         video_id = mobj.group(1)
3392
3393         self.report_webpage(video_id)
3394
3395         # Get webpage content
3396         try:
3397             webpage_bytes = compat_urllib_request.urlopen(url).read()
3398             webpage = webpage_bytes.decode('utf-8')
3399         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3400             self._downloader.report_error(u'unable to download video webpage: %s' % err)
3401             return
3402
3403         result = re.search(self.VIDEO_URL_RE, webpage)
3404         if result is None:
3405             self._downloader.report_error(u'unable to extract video url')
3406             return
3407         video_url = compat_urllib_parse.unquote(result.group(1))
3408
3409         result = re.search(self.VIDEO_TITLE_RE, webpage)
3410         if result is None:
3411             self._downloader.report_error(u'unable to extract video title')
3412             return
3413         video_title = result.group(1)
3414
3415         result = re.search(self.VIDEO_THUMB_RE, webpage)
3416         if result is None:
3417             self._downloader.report_error(u'unable to extract video thumbnail')
3418             return
3419         video_thumbnail = result.group(1)
3420
3421         return [{
3422             'id': video_id,
3423             'url': video_url,
3424             'uploader': None,
3425             'upload_date': None,
3426             'title': video_title,
3427             'ext': 'flv',
3428             'thumbnail': video_thumbnail,
3429             'description': None,
3430         }]
3431
3432
3433 class GooglePlusIE(InfoExtractor):
3434     """Information extractor for plus.google.com."""
3435
3436     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3437     IE_NAME = u'plus.google'
3438
3439     def __init__(self, downloader=None):
3440         InfoExtractor.__init__(self, downloader)
3441
3442     def report_extract_entry(self, url):
3443         """Report downloading extry"""
3444         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3445
3446     def report_date(self, upload_date):
3447         """Report downloading extry"""
3448         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3449
3450     def report_uploader(self, uploader):
3451         """Report downloading extry"""
3452         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3453
3454     def report_title(self, video_title):
3455         """Report downloading extry"""
3456         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3457
3458     def report_extract_vid_page(self, video_page):
3459         """Report information extraction."""
3460         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3461
3462     def _real_extract(self, url):
3463         # Extract id from URL
3464         mobj = re.match(self._VALID_URL, url)
3465         if mobj is None:
3466             self._downloader.report_error(u'Invalid URL: %s' % url)
3467             return
3468
3469         post_url = mobj.group(0)
3470         video_id = mobj.group(1)
3471
3472         video_extension = 'flv'
3473
3474         # Step 1, Retrieve post webpage to extract further information
3475         self.report_extract_entry(post_url)
3476         request = compat_urllib_request.Request(post_url)
3477         try:
3478             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3479         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3480             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3481             return
3482
3483         # Extract update date
3484         upload_date = None
3485         pattern = 'title="Timestamp">(.*?)</a>'
3486         mobj = re.search(pattern, webpage)
3487         if mobj:
3488             upload_date = mobj.group(1)
3489             # Convert timestring to a format suitable for filename
3490             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3491             upload_date = upload_date.strftime('%Y%m%d')
3492         self.report_date(upload_date)
3493
3494         # Extract uploader
3495         uploader = None
3496         pattern = r'rel\="author".*?>(.*?)</a>'
3497         mobj = re.search(pattern, webpage)
3498         if mobj:
3499             uploader = mobj.group(1)
3500         self.report_uploader(uploader)
3501
3502         # Extract title
3503         # Get the first line for title
3504         video_title = u'NA'
3505         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3506         mobj = re.search(pattern, webpage)
3507         if mobj:
3508             video_title = mobj.group(1)
3509         self.report_title(video_title)
3510
3511         # Step 2, Stimulate clicking the image box to launch video
3512         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3513         mobj = re.search(pattern, webpage)
3514         if mobj is None:
3515             self._downloader.report_error(u'unable to extract video page URL')
3516
3517         video_page = mobj.group(1)
3518         request = compat_urllib_request.Request(video_page)
3519         try:
3520             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3521         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3522             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3523             return
3524         self.report_extract_vid_page(video_page)
3525
3526
3527         # Extract video links on video page
3528         """Extract video links of all sizes"""
3529         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3530         mobj = re.findall(pattern, webpage)
3531         if len(mobj) == 0:
3532             self._downloader.report_error(u'unable to extract video links')
3533
3534         # Sort in resolution
3535         links = sorted(mobj)
3536
3537         # Choose the lowest of the sort, i.e. highest resolution
3538         video_url = links[-1]
3539         # Only get the url. The resolution part in the tuple has no use anymore
3540         video_url = video_url[-1]
3541         # Treat escaped \u0026 style hex
3542         try:
3543             video_url = video_url.decode("unicode_escape")
3544         except AttributeError: # Python 3
3545             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3546
3547
3548         return [{
3549             'id':       video_id,
3550             'url':      video_url,
3551             'uploader': uploader,
3552             'upload_date':  upload_date,
3553             'title':    video_title,
3554             'ext':      video_extension,
3555         }]
3556
3557 class NBAIE(InfoExtractor):
3558     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3559     IE_NAME = u'nba'
3560
3561     def _real_extract(self, url):
3562         mobj = re.match(self._VALID_URL, url)
3563         if mobj is None:
3564             self._downloader.report_error(u'invalid URL: %s' % url)
3565             return
3566
3567         video_id = mobj.group(1)
3568         if video_id.endswith('/index.html'):
3569             video_id = video_id[:-len('/index.html')]
3570
3571         webpage = self._download_webpage(url, video_id)
3572
3573         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3574         def _findProp(rexp, default=None):
3575             m = re.search(rexp, webpage)
3576             if m:
3577                 return unescapeHTML(m.group(1))
3578             else:
3579                 return default
3580
3581         shortened_video_id = video_id.rpartition('/')[2]
3582         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3583         info = {
3584             'id': shortened_video_id,
3585             'url': video_url,
3586             'ext': 'mp4',
3587             'title': title,
3588             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3589             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3590         }
3591         return [info]
3592
3593 class JustinTVIE(InfoExtractor):
3594     """Information extractor for justin.tv and twitch.tv"""
3595     # TODO: One broadcast may be split into multiple videos. The key
3596     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3597     # starts at 1 and increases. Can we treat all parts as one video?
3598
3599     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3600         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3601     _JUSTIN_PAGE_LIMIT = 100
3602     IE_NAME = u'justin.tv'
3603
3604     def report_extraction(self, file_id):
3605         """Report information extraction."""
3606         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3607
3608     def report_download_page(self, channel, offset):
3609         """Report attempt to download a single page of videos."""
3610         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3611                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3612
3613     # Return count of items, list of *valid* items
3614     def _parse_page(self, url):
3615         try:
3616             urlh = compat_urllib_request.urlopen(url)
3617             webpage_bytes = urlh.read()
3618             webpage = webpage_bytes.decode('utf-8', 'ignore')
3619         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3620             self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3621             return
3622
3623         response = json.loads(webpage)
3624         if type(response) != list:
3625             error_text = response.get('error', 'unknown error')
3626             self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3627             return
3628         info = []
3629         for clip in response:
3630             video_url = clip['video_file_url']
3631             if video_url:
3632                 video_extension = os.path.splitext(video_url)[1][1:]
3633                 video_date = re.sub('-', '', clip['start_time'][:10])
3634                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3635                 video_id = clip['id']
3636                 video_title = clip.get('title', video_id)
3637                 info.append({
3638                     'id': video_id,
3639                     'url': video_url,
3640                     'title': video_title,
3641                     'uploader': clip.get('channel_name', video_uploader_id),
3642                     'uploader_id': video_uploader_id,
3643                     'upload_date': video_date,
3644                     'ext': video_extension,
3645                 })
3646         return (len(response), info)
3647
3648     def _real_extract(self, url):
3649         mobj = re.match(self._VALID_URL, url)
3650         if mobj is None:
3651             self._downloader.report_error(u'invalid URL: %s' % url)
3652             return
3653
3654         api = 'http://api.justin.tv'
3655         video_id = mobj.group(mobj.lastindex)
3656         paged = False
3657         if mobj.lastindex == 1:
3658             paged = True
3659             api += '/channel/archives/%s.json'
3660         else:
3661             api += '/broadcast/by_archive/%s.json'
3662         api = api % (video_id,)
3663
3664         self.report_extraction(video_id)
3665
3666         info = []
3667         offset = 0
3668         limit = self._JUSTIN_PAGE_LIMIT
3669         while True:
3670             if paged:
3671                 self.report_download_page(video_id, offset)
3672             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3673             page_count, page_info = self._parse_page(page_url)
3674             info.extend(page_info)
3675             if not paged or page_count != limit:
3676                 break
3677             offset += limit
3678         return info
3679
3680 class FunnyOrDieIE(InfoExtractor):
3681     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3682
3683     def _real_extract(self, url):
3684         mobj = re.match(self._VALID_URL, url)
3685         if mobj is None:
3686             self._downloader.report_error(u'invalid URL: %s' % url)
3687             return
3688
3689         video_id = mobj.group('id')
3690         webpage = self._download_webpage(url, video_id)
3691
3692         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3693         if not m:
3694             self._downloader.report_error(u'unable to find video information')
3695         video_url = unescapeHTML(m.group('url'))
3696
3697         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3698         if not m:
3699             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3700             if not m:
3701                 self._downloader.trouble(u'Cannot find video title')
3702         title = clean_html(m.group('title'))
3703
3704         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3705         if m:
3706             desc = unescapeHTML(m.group('desc'))
3707         else:
3708             desc = None
3709
3710         info = {
3711             'id': video_id,
3712             'url': video_url,
3713             'ext': 'mp4',
3714             'title': title,
3715             'description': desc,
3716         }
3717         return [info]
3718
3719 class SteamIE(InfoExtractor):
3720     _VALID_URL = r"""http://store.steampowered.com/
3721                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3722                 (?P<gameID>\d+)/?
3723                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3724                 """
3725
3726     @classmethod
3727     def suitable(cls, url):
3728         """Receives a URL and returns True if suitable for this IE."""
3729         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3730
3731     def _real_extract(self, url):
3732         m = re.match(self._VALID_URL, url, re.VERBOSE)
3733         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3734         gameID = m.group('gameID')
3735         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3736         webpage = self._download_webpage(videourl, gameID)
3737         mweb = re.finditer(urlRE, webpage)
3738         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3739         titles = re.finditer(namesRE, webpage)
3740         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3741         thumbs = re.finditer(thumbsRE, webpage)
3742         videos = []
3743         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3744             video_id = vid.group('videoID')
3745             title = vtitle.group('videoName')
3746             video_url = vid.group('videoURL')
3747             video_thumb = thumb.group('thumbnail')
3748             if not video_url:
3749                 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3750             info = {
3751                 'id':video_id,
3752                 'url':video_url,
3753                 'ext': 'flv',
3754                 'title': unescapeHTML(title),
3755                 'thumbnail': video_thumb
3756                   }
3757             videos.append(info)
3758         return videos
3759
3760 class UstreamIE(InfoExtractor):
3761     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3762     IE_NAME = u'ustream'
3763
3764     def _real_extract(self, url):
3765         m = re.match(self._VALID_URL, url)
3766         video_id = m.group('videoID')
3767         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3768         webpage = self._download_webpage(url, video_id)
3769         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3770         title = m.group('title')
3771         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3772         uploader = m.group('uploader')
3773         info = {
3774                 'id':video_id,
3775                 'url':video_url,
3776                 'ext': 'flv',
3777                 'title': title,
3778                 'uploader': uploader
3779                   }
3780         return [info]
3781
3782 class WorldStarHipHopIE(InfoExtractor):
3783     _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3784     IE_NAME = u'WorldStarHipHop'
3785
3786     def _real_extract(self, url):
3787         _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3788
3789         webpage_src = compat_urllib_request.urlopen(url).read()
3790         webpage_src = webpage_src.decode('utf-8')
3791
3792         mobj = re.search(_src_url, webpage_src)
3793
3794         m = re.match(self._VALID_URL, url)
3795         video_id = m.group('id')
3796
3797         if mobj is not None:
3798             video_url = mobj.group()
3799             if 'mp4' in video_url:
3800                 ext = 'mp4'
3801             else:
3802                 ext = 'flv'
3803         else:
3804             self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3805             return
3806
3807         _title = r"""<title>(.*)</title>"""
3808
3809         mobj = re.search(_title, webpage_src)
3810
3811         if mobj is not None:
3812             title = mobj.group(1)
3813         else:
3814             title = 'World Start Hip Hop - %s' % time.ctime()
3815
3816         _thumbnail = r"""rel="image_src" href="(.*)" />"""
3817         mobj = re.search(_thumbnail, webpage_src)
3818
3819         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3820         if mobj is not None:
3821             thumbnail = mobj.group(1)
3822         else:
3823             _title = r"""candytitles.*>(.*)</span>"""
3824             mobj = re.search(_title, webpage_src)
3825             if mobj is not None:
3826                 title = mobj.group(1)
3827             thumbnail = None
3828
3829         results = [{
3830                     'id': video_id,
3831                     'url' : video_url,
3832                     'title' : title,
3833                     'thumbnail' : thumbnail,
3834                     'ext' : ext,
3835                     }]
3836         return results
3837
3838 class RBMARadioIE(InfoExtractor):
3839     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3840
3841     def _real_extract(self, url):
3842         m = re.match(self._VALID_URL, url)
3843         video_id = m.group('videoID')
3844
3845         webpage = self._download_webpage(url, video_id)
3846         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3847         if not m:
3848             raise ExtractorError(u'Cannot find metadata')
3849         json_data = m.group(1)
3850
3851         try:
3852             data = json.loads(json_data)
3853         except ValueError as e:
3854             raise ExtractorError(u'Invalid JSON: ' + str(e))
3855
3856         video_url = data['akamai_url'] + '&cbr=256'
3857         url_parts = compat_urllib_parse_urlparse(video_url)
3858         video_ext = url_parts.path.rpartition('.')[2]
3859         info = {
3860                 'id': video_id,
3861                 'url': video_url,
3862                 'ext': video_ext,
3863                 'title': data['title'],
3864                 'description': data.get('teaser_text'),
3865                 'location': data.get('country_of_origin'),
3866                 'uploader': data.get('host', {}).get('name'),
3867                 'uploader_id': data.get('host', {}).get('slug'),
3868                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3869                 'duration': data.get('duration'),
3870         }
3871         return [info]
3872
3873
3874 class YouPornIE(InfoExtractor):
3875     """Information extractor for youporn.com."""
3876     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3877
3878     def _print_formats(self, formats):
3879         """Print all available formats"""
3880         print(u'Available formats:')
3881         print(u'ext\t\tformat')
3882         print(u'---------------------------------')
3883         for format in formats:
3884             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3885
3886     def _specific(self, req_format, formats):
3887         for x in formats:
3888             if(x["format"]==req_format):
3889                 return x
3890         return None
3891
3892     def _real_extract(self, url):
3893         mobj = re.match(self._VALID_URL, url)
3894         if mobj is None:
3895             self._downloader.report_error(u'invalid URL: %s' % url)
3896             return
3897
3898         video_id = mobj.group('videoid')
3899
3900         req = compat_urllib_request.Request(url)
3901         req.add_header('Cookie', 'age_verified=1')
3902         webpage = self._download_webpage(req, video_id)
3903
3904         # Get the video title
3905         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3906         if result is None:
3907             raise ExtractorError(u'Unable to extract video title')
3908         video_title = result.group('title').strip()
3909
3910         # Get the video date
3911         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3912         if result is None:
3913             self._downloader.report_warning(u'unable to extract video date')
3914             upload_date = None
3915         else:
3916             upload_date = result.group('date').strip()
3917
3918         # Get the video uploader
3919         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3920         if result is None:
3921             self._downloader.report_warning(u'unable to extract uploader')
3922             video_uploader = None
3923         else:
3924             video_uploader = result.group('uploader').strip()
3925             video_uploader = clean_html( video_uploader )
3926
3927         # Get all of the formats available
3928         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3929         result = re.search(DOWNLOAD_LIST_RE, webpage)
3930         if result is None:
3931             raise ExtractorError(u'Unable to extract download list')
3932         download_list_html = result.group('download_list').strip()
3933
3934         # Get all of the links from the page
3935         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3936         links = re.findall(LINK_RE, download_list_html)
3937         if(len(links) == 0):
3938             raise ExtractorError(u'ERROR: no known formats available for video')
3939
3940         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3941
3942         formats = []
3943         for link in links:
3944
3945             # A link looks like this:
3946             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3947             # A path looks like this:
3948             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3949             video_url = unescapeHTML( link )
3950             path = compat_urllib_parse_urlparse( video_url ).path
3951             extension = os.path.splitext( path )[1][1:]
3952             format = path.split('/')[4].split('_')[:2]
3953             size = format[0]
3954             bitrate = format[1]
3955             format = "-".join( format )
3956             title = u'%s-%s-%s' % (video_title, size, bitrate)
3957
3958             formats.append({
3959                 'id': video_id,
3960                 'url': video_url,
3961                 'uploader': video_uploader,
3962                 'upload_date': upload_date,
3963                 'title': title,
3964                 'ext': extension,
3965                 'format': format,
3966                 'thumbnail': None,
3967                 'description': None,
3968                 'player_url': None
3969             })
3970
3971         if self._downloader.params.get('listformats', None):
3972             self._print_formats(formats)
3973             return
3974
3975         req_format = self._downloader.params.get('format', None)
3976         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3977
3978         if req_format is None or req_format == 'best':
3979             return [formats[0]]
3980         elif req_format == 'worst':
3981             return [formats[-1]]
3982         elif req_format in ('-1', 'all'):
3983             return formats
3984         else:
3985             format = self._specific( req_format, formats )
3986             if result is None:
3987                 self._downloader.report_error(u'requested format not available')
3988                 return
3989             return [format]
3990
3991
3992
3993 class PornotubeIE(InfoExtractor):
3994     """Information extractor for pornotube.com."""
3995     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3996
3997     def _real_extract(self, url):
3998         mobj = re.match(self._VALID_URL, url)
3999         if mobj is None:
4000             self._downloader.report_error(u'invalid URL: %s' % url)
4001             return
4002
4003         video_id = mobj.group('videoid')
4004         video_title = mobj.group('title')
4005
4006         # Get webpage content
4007         webpage = self._download_webpage(url, video_id)
4008
4009         # Get the video URL
4010         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4011         result = re.search(VIDEO_URL_RE, webpage)
4012         if result is None:
4013             self._downloader.report_error(u'unable to extract video url')
4014             return
4015         video_url = compat_urllib_parse.unquote(result.group('url'))
4016
4017         #Get the uploaded date
4018         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4019         result = re.search(VIDEO_UPLOADED_RE, webpage)
4020         if result is None:
4021             self._downloader.report_error(u'unable to extract video title')
4022             return
4023         upload_date = result.group('date')
4024
4025         info = {'id': video_id,
4026                 'url': video_url,
4027                 'uploader': None,
4028                 'upload_date': upload_date,
4029                 'title': video_title,
4030                 'ext': 'flv',
4031                 'format': 'flv'}
4032
4033         return [info]
4034
4035 class YouJizzIE(InfoExtractor):
4036     """Information extractor for youjizz.com."""
4037     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4038
4039     def _real_extract(self, url):
4040         mobj = re.match(self._VALID_URL, url)
4041         if mobj is None:
4042             self._downloader.report_error(u'invalid URL: %s' % url)
4043             return
4044
4045         video_id = mobj.group('videoid')
4046
4047         # Get webpage content
4048         webpage = self._download_webpage(url, video_id)
4049
4050         # Get the video title
4051         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4052         if result is None:
4053             raise ExtractorError(u'ERROR: unable to extract video title')
4054         video_title = result.group('title').strip()
4055
4056         # Get the embed page
4057         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4058         if result is None:
4059             raise ExtractorError(u'ERROR: unable to extract embed page')
4060
4061         embed_page_url = result.group(0).strip()
4062         video_id = result.group('videoid')
4063
4064         webpage = self._download_webpage(embed_page_url, video_id)
4065
4066         # Get the video URL
4067         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4068         if result is None:
4069             raise ExtractorError(u'ERROR: unable to extract video url')
4070         video_url = result.group('source')
4071
4072         info = {'id': video_id,
4073                 'url': video_url,
4074                 'title': video_title,
4075                 'ext': 'flv',
4076                 'format': 'flv',
4077                 'player_url': embed_page_url}
4078
4079         return [info]
4080
4081 class EightTracksIE(InfoExtractor):
4082     IE_NAME = '8tracks'
4083     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4084
4085     def _real_extract(self, url):
4086         mobj = re.match(self._VALID_URL, url)
4087         if mobj is None:
4088             raise ExtractorError(u'Invalid URL: %s' % url)
4089         playlist_id = mobj.group('id')
4090
4091         webpage = self._download_webpage(url, playlist_id)
4092
4093         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4094         if not m:
4095             raise ExtractorError(u'Cannot find trax information')
4096         json_like = m.group(1)
4097         data = json.loads(json_like)
4098
4099         session = str(random.randint(0, 1000000000))
4100         mix_id = data['id']
4101         track_count = data['tracks_count']
4102         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4103         next_url = first_url
4104         res = []
4105         for i in itertools.count():
4106             api_json = self._download_webpage(next_url, playlist_id,
4107                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4108                 errnote=u'Failed to download song information')
4109             api_data = json.loads(api_json)
4110             track_data = api_data[u'set']['track']
4111             info = {
4112                 'id': track_data['id'],
4113                 'url': track_data['track_file_stream_url'],
4114                 'title': track_data['performer'] + u' - ' + track_data['name'],
4115                 'raw_title': track_data['name'],
4116                 'uploader_id': data['user']['login'],
4117                 'ext': 'm4a',
4118             }
4119             res.append(info)
4120             if api_data['set']['at_last_track']:
4121                 break
4122             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
4123         return res
4124
4125 class KeekIE(InfoExtractor):
4126     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4127     IE_NAME = u'keek'
4128
4129     def _real_extract(self, url):
4130         m = re.match(self._VALID_URL, url)
4131         video_id = m.group('videoID')
4132         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4133         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4134         webpage = self._download_webpage(url, video_id)
4135         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4136         title = unescapeHTML(m.group('title'))
4137         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4138         uploader = clean_html(m.group('uploader'))
4139         info = {
4140                 'id': video_id,
4141                 'url': video_url,
4142                 'ext': 'mp4',
4143                 'title': title,
4144                 'thumbnail': thumbnail,
4145                 'uploader': uploader
4146         }
4147         return [info]
4148
4149 class TEDIE(InfoExtractor):
4150     _VALID_URL=r'''http://www.ted.com/
4151                    (
4152                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4153                         |
4154                         ((?P<type_talk>talks)) # We have a simple talk
4155                    )
4156                    /(?P<name>\w+) # Here goes the name and then ".html"
4157                    '''
4158
4159     @classmethod
4160     def suitable(cls, url):
4161         """Receives a URL and returns True if suitable for this IE."""
4162         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4163
4164     def _real_extract(self, url):
4165         m=re.match(self._VALID_URL, url, re.VERBOSE)
4166         if m.group('type_talk'):
4167             return [self._talk_info(url)]
4168         else :
4169             playlist_id=m.group('playlist_id')
4170             name=m.group('name')
4171             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4172             return self._playlist_videos_info(url,name,playlist_id)
4173
4174     def _talk_video_link(self,mediaSlug):
4175         '''Returns the video link for that mediaSlug'''
4176         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4177
4178     def _playlist_videos_info(self,url,name,playlist_id=0):
4179         '''Returns the videos of the playlist'''
4180         video_RE=r'''
4181                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4182                      ([.\s]*?)data-playlist_item_id="(\d+)"
4183                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4184                      '''
4185         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4186         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4187         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4188         m_names=re.finditer(video_name_RE,webpage)
4189         info=[]
4190         for m_video, m_name in zip(m_videos,m_names):
4191             video_id=m_video.group('video_id')
4192             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4193             info.append(self._talk_info(talk_url,video_id))
4194         return info
4195
4196     def _talk_info(self, url, video_id=0):
4197         """Return the video for the talk in the url"""
4198         m=re.match(self._VALID_URL, url,re.VERBOSE)
4199         videoName=m.group('name')
4200         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4201         # If the url includes the language we get the title translated
4202         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4203         title=re.search(title_RE, webpage).group('title')
4204         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4205                         "id":(?P<videoID>[\d]+).*?
4206                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4207         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4208         thumb_match=re.search(thumb_RE,webpage)
4209         info_match=re.search(info_RE,webpage,re.VERBOSE)
4210         video_id=info_match.group('videoID')
4211         mediaSlug=info_match.group('mediaSlug')
4212         video_url=self._talk_video_link(mediaSlug)
4213         info = {
4214                 'id': video_id,
4215                 'url': video_url,
4216                 'ext': 'mp4',
4217                 'title': title,
4218                 'thumbnail': thumb_match.group('thumbnail')
4219                 }
4220         return info
4221
4222 class MySpassIE(InfoExtractor):
4223     _VALID_URL = r'http://www.myspass.de/.*'
4224
4225     def _real_extract(self, url):
4226         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4227
4228         # video id is the last path element of the URL
4229         # usually there is a trailing slash, so also try the second but last
4230         url_path = compat_urllib_parse_urlparse(url).path
4231         url_parent_path, video_id = os.path.split(url_path)
4232         if not video_id:
4233             _, video_id = os.path.split(url_parent_path)
4234
4235         # get metadata
4236         metadata_url = META_DATA_URL_TEMPLATE % video_id
4237         metadata_text = self._download_webpage(metadata_url, video_id)
4238         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4239
4240         # extract values from metadata
4241         url_flv_el = metadata.find('url_flv')
4242         if url_flv_el is None:
4243             self._downloader.report_error(u'unable to extract download url')
4244             return
4245         video_url = url_flv_el.text
4246         extension = os.path.splitext(video_url)[1][1:]
4247         title_el = metadata.find('title')
4248         if title_el is None:
4249             self._downloader.report_error(u'unable to extract title')
4250             return
4251         title = title_el.text
4252         format_id_el = metadata.find('format_id')
4253         if format_id_el is None:
4254             format = ext
4255         else:
4256             format = format_id_el.text
4257         description_el = metadata.find('description')
4258         if description_el is not None:
4259             description = description_el.text
4260         else:
4261             description = None
4262         imagePreview_el = metadata.find('imagePreview')
4263         if imagePreview_el is not None:
4264             thumbnail = imagePreview_el.text
4265         else:
4266             thumbnail = None
4267         info = {
4268             'id': video_id,
4269             'url': video_url,
4270             'title': title,
4271             'ext': extension,
4272             'format': format,
4273             'thumbnail': thumbnail,
4274             'description': description
4275         }
4276         return [info]
4277
4278 class SpiegelIE(InfoExtractor):
4279     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4280
4281     def _real_extract(self, url):
4282         m = re.match(self._VALID_URL, url)
4283         video_id = m.group('videoID')
4284
4285         webpage = self._download_webpage(url, video_id)
4286         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4287         if not m:
4288             raise ExtractorError(u'Cannot find title')
4289         video_title = unescapeHTML(m.group(1))
4290
4291         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4292         xml_code = self._download_webpage(xml_url, video_id,
4293                     note=u'Downloading XML', errnote=u'Failed to download XML')
4294
4295         idoc = xml.etree.ElementTree.fromstring(xml_code)
4296         last_type = idoc[-1]
4297         filename = last_type.findall('./filename')[0].text
4298         duration = float(last_type.findall('./duration')[0].text)
4299
4300         video_url = 'http://video2.spiegel.de/flash/' + filename
4301         video_ext = filename.rpartition('.')[2]
4302         info = {
4303             'id': video_id,
4304             'url': video_url,
4305             'ext': video_ext,
4306             'title': video_title,
4307             'duration': duration,
4308         }
4309         return [info]
4310
4311 class LiveLeakIE(InfoExtractor):
4312
4313     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4314     IE_NAME = u'liveleak'
4315
4316     def _real_extract(self, url):
4317         mobj = re.match(self._VALID_URL, url)
4318         if mobj is None:
4319             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4320             return
4321
4322         video_id = mobj.group('video_id')
4323
4324         webpage = self._download_webpage(url, video_id)
4325
4326         m = re.search(r'file: "(.*?)",', webpage)
4327         if not m:
4328             self._downloader.report_error(u'unable to find video url')
4329             return
4330         video_url = m.group(1)
4331
4332         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4333         if not m:
4334             self._downloader.trouble(u'Cannot find video title')
4335         title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4336
4337         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4338         if m:
4339             desc = unescapeHTML(m.group('desc'))
4340         else:
4341             desc = None
4342
4343         m = re.search(r'By:.*?(\w+)</a>', webpage)
4344         if m:
4345             uploader = clean_html(m.group(1))
4346         else:
4347             uploader = None
4348
4349         info = {
4350             'id':  video_id,
4351             'url': video_url,
4352             'ext': 'mp4',
4353             'title': title,
4354             'description': desc,
4355             'uploader': uploader
4356         }
4357
4358         return [info]
4359
4360 class ARDIE(InfoExtractor):
4361     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4362     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4363     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4364
4365     def _real_extract(self, url):
4366         # determine video id from url
4367         m = re.match(self._VALID_URL, url)
4368
4369         numid = re.search(r'documentId=([0-9]+)', url)
4370         if numid:
4371             video_id = numid.group(1)
4372         else:
4373             video_id = m.group('video_id')
4374
4375         # determine title and media streams from webpage
4376         html = self._download_webpage(url, video_id)
4377         title = re.search(self._TITLE, html).group('title')
4378         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4379         if not streams:
4380             assert '"fsk"' in html
4381             self._downloader.report_error(u'this video is only available after 8:00 pm')
4382             return
4383
4384         # choose default media type and highest quality for now
4385         stream = max([s for s in streams if int(s["media_type"]) == 0],
4386                      key=lambda s: int(s["quality"]))
4387
4388         # there's two possibilities: RTMP stream or HTTP download
4389         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4390         if stream['rtmp_url']:
4391             self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
4392             assert stream['video_url'].startswith('mp4:')
4393             info["url"] = stream["rtmp_url"]
4394             info["play_path"] = stream['video_url']
4395         else:
4396             assert stream["video_url"].endswith('.mp4')
4397             info["url"] = stream["video_url"]
4398         return [info]
4399
4400
4401 def gen_extractors():
4402     """ Return a list of an instance of every supported extractor.
4403     The order does matter; the first extractor matched is the one handling the URL.
4404     """
4405     return [
4406         YoutubePlaylistIE(),
4407         YoutubeChannelIE(),
4408         YoutubeUserIE(),
4409         YoutubeSearchIE(),
4410         YoutubeIE(),
4411         MetacafeIE(),
4412         DailymotionIE(),
4413         GoogleSearchIE(),
4414         PhotobucketIE(),
4415         YahooIE(),
4416         YahooSearchIE(),
4417         DepositFilesIE(),
4418         FacebookIE(),
4419         BlipTVUserIE(),
4420         BlipTVIE(),
4421         VimeoIE(),
4422         MyVideoIE(),
4423         ComedyCentralIE(),
4424         EscapistIE(),
4425         CollegeHumorIE(),
4426         XVideosIE(),
4427         SoundcloudSetIE(),
4428         SoundcloudIE(),
4429         InfoQIE(),
4430         MixcloudIE(),
4431         StanfordOpenClassroomIE(),
4432         MTVIE(),
4433         YoukuIE(),
4434         XNXXIE(),
4435         YouJizzIE(),
4436         PornotubeIE(),
4437         YouPornIE(),
4438         GooglePlusIE(),
4439         ArteTvIE(),
4440         NBAIE(),
4441         WorldStarHipHopIE(),
4442         JustinTVIE(),
4443         FunnyOrDieIE(),
4444         SteamIE(),
4445         UstreamIE(),
4446         RBMARadioIE(),
4447         EightTracksIE(),
4448         KeekIE(),
4449         TEDIE(),
4450         MySpassIE(),
4451         SpiegelIE(),
4452         LiveLeakIE(),
4453         ARDIE(),
4454         GenericIE()
4455     ]