Allow downloading videos that require an age check from Steam
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor takes a URL and produces metadata about the
    video (or videos) the URL refers to: the real media URL, the title,
    the uploader and so on.  That metadata is handed to the
    FileDownloader as dictionaries, and the FileDownloader may then
    download the media to the file system, among other outcomes.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All of these fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and redefine the
    _real_initialize() and _real_extract() methods; _real_extract() must
    return a *list* of dictionaries shaped as above.  They should
    probably also be added to the list of extractors.

    Broken extractors should set the _WORKING attribute to False so the
    users are warned and the tests are skipped.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this extractor can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Expose the _WORKING flag."""
        return cls._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) exactly once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the extracted info dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader used for output and parameters."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name minus the "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        note = u'Downloading video webpage' if note is None else note
        # note=False means "stay silent"; anything else is announced.
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honor the charset declared in the Content-Type header, if any.
        charset_match = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
            handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return raw_page.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they tag results with the proper '_type' value.
    def video_result(self, video_info):
        """Tag video_info as a plain video result."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Build a result pointing to another page to be processed."""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Build a playlist result wrapping the given entries."""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
181
182
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename extension; anything unknown defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" string used for the 'format' field.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so defer to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video subtitles."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or a tuple
        (error_message, None) on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per
        available subtitle language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, when credentials are available,
        log in and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-CSRF tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from a URL matching _VALID_URL, or report an
        error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 1 is the (optional) URL prefix, group 2 the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download the watch page and get_video_info data for the URL and
        return a list of info dicts, one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info; try several 'el' values, some work for videos
        # where others are rejected (e.g. embedded or VEVO content).
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try the known date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This layout didn't match; try the next one.
                    pass
                else:
                    # Parsed successfully; don't re-parse the normalized value.
                    break

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Append the signature only when present; entries without a 'sig'
            # field must not crash the extraction with a KeyError.
            url_map = dict((ud['itag'][0],
                            ud['url'][0] + ('&signature=' + ud['sig'][0] if 'sig' in ud else ''))
                           for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form.

        Runs once before extraction so that filtered (age-restricted)
        videos are reachable in _real_extract. Errors are reported and
        swallowed; extraction will then fail on its own.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # BUG FIX: urlencode() returns text; Request POST data must be bytes
        # on Python 3. encode('ascii') is a no-op on Python 2 str.
        request = compat_urllib_request.Request(self._FILTER_POST,
            compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe.com page.

        Returns a one-element list containing the info dictionary, a
        url_result delegation for yt-prefixed ids, or None after reporting
        an error.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; 'yt-<id>' ids are hosted there,
        # so delegate to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Older page layout: direct mediaURL parameter.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer layout: media info lives in the JSON-ish 'mediaData'
            # entry of the flashvars query string.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # BUG FIX: _download_webpage already returns a unicode string, so
        # the old .decode('utf-8') calls failed on non-ASCII content
        # (implicit ASCII encode on Python 2, AttributeError on Python 3).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
788
789
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the video URL, title, uploader and upload date."""
        # Validate the URL and pull out the video id (strip the title
        # suffix after '_' and any query string).
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Request the page with the family filter cookie switched off so
        # that filtered videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # The media URLs live in a 'flashvars' JS assignment.
        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality present, scanning from highest to lowest.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: prefer the owner span; fall back to the 'official
        # user' author span, warning if neither matches.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
873
874
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Extract the .flv URL, title and uploader from a photobucket page."""
        # The id is the 'current' query parameter captured by _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        video_id = match.group(1)

        video_extension = 'flv'

        # Fetch the page directly (this extractor predates the
        # _download_webpage helper and keeps its own error reporting).
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # The media URL is the 'file' parameter of the video_src link.
        self.report_extraction(video_id)
        match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title and uploader both come from the <title> tag.
        match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = match.group(1).decode('utf-8')
        video_uploader = match.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
934
935
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a video.yahoo.com page.

        Non-'/watch/' URLs are first rewritten to the canonical /watch/
        form and re-extracted recursively (new_video=False on the
        recursive call).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            # The page embeds the canonical ids as ("id", ...) / ("vid", ...)
            # JS call arguments.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUG FIX: group(1) captures the literal 'people'/'profile' path
        # segment; the uploader's display name is captured by group(2).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (the playlist request below
        # requires both).
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1073
1074
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from the page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (between ' = {config:' and ',assets:').
        # BUG FIX: this used a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit. Only the failures these two lines
        # can produce are caught now: IndexError from the split (marker
        # missing) and ValueError from json.loads (malformed JSON).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available (codec, ext, quality) triple, hd first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1189
1190
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw body, or None after reporting
        a network error or an invalid URL."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return a dict mapping each
        (group_index, key, error_message) tuple in matchTuples to the
        corresponding match group. Returns None on any failure.
        """
        page = self.fetch_webpage(url)
        # BUG FIX: fetch_webpage returns None on failure; re.search(None)
        # raised TypeError instead of failing gracefully.
        if page is None:
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve a live-stream page to its rtmp URL and return the info dict."""
        video_lang = url.split('/')[-4]
        # Step 1: find the videothek JS file referenced by the page.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: the JS file contains the stream path, SWF player and
        # rtmp server for this language.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # BUG FIX: the computed rtmp URL used to be discarded (no return),
        # so live streams silently produced no result. ext 'flv' for the
        # rtmp stream and the page name as id/title are assumptions —
        # confirm against a live page.
        return {
            'id':           url.split('/')[-1],
            'url':          video_url,
            'uploader':     u'arte.tv',
            'upload_date':  None,
            'title':        url.split('/')[-1],
            'ext':          u'flv',
            'format':       u'NA',
            'player_url':   info.get('player'),
        }

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through its two XML indirections and
        return the info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the movie param carries the videoref XML URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> element for the requested language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML holds id, title, date and the HD url.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in 'index-<n>.html'; everything else is Arte+7.
        if re.search(self._LIVE_URL, video_id) is not None:
            # BUG FIX: previously the live branch returned None, dropping
            # the extracted stream entirely.
            info = self.extractLiveStream(url)
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1321
1322
1323 class GenericIE(InfoExtractor):
1324     """Generic last-resort information extractor."""
1325
1326     _VALID_URL = r'.*'
1327     IE_NAME = u'generic'
1328
1329     def __init__(self, downloader=None):
1330         InfoExtractor.__init__(self, downloader)
1331
1332     def report_download_webpage(self, video_id):
1333         """Report webpage download."""
1334         if not self._downloader.params.get('test', False):
1335             self._downloader.report_warning(u'Falling back on generic information extractor.')
1336         self.to_screen(u'%s: Downloading webpage' % video_id)
1337
1338     def report_following_redirect(self, new_url):
1339         """Report information extraction."""
1340         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1341
1342     def _test_redirect(self, url):
1343         """Check if it is a redirect, like url shorteners, in case return the new url."""
1344         class HeadRequest(compat_urllib_request.Request):
1345             def get_method(self):
1346                 return "HEAD"
1347
1348         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1349             """
1350             Subclass the HTTPRedirectHandler to make it use our
1351             HeadRequest also on the redirected URL
1352             """
1353             def redirect_request(self, req, fp, code, msg, headers, newurl):
1354                 if code in (301, 302, 303, 307):
1355                     newurl = newurl.replace(' ', '%20')
1356                     newheaders = dict((k,v) for k,v in req.headers.items()
1357                                       if k.lower() not in ("content-length", "content-type"))
1358                     return HeadRequest(newurl,
1359                                        headers=newheaders,
1360                                        origin_req_host=req.get_origin_req_host(),
1361                                        unverifiable=True)
1362                 else:
1363                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1364
1365         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1366             """
1367             Fallback to GET if HEAD is not allowed (405 HTTP error)
1368             """
1369             def http_error_405(self, req, fp, code, msg, headers):
1370                 fp.read()
1371                 fp.close()
1372
1373                 newheaders = dict((k,v) for k,v in req.headers.items()
1374                                   if k.lower() not in ("content-length", "content-type"))
1375                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1376                                                  headers=newheaders,
1377                                                  origin_req_host=req.get_origin_req_host(),
1378                                                  unverifiable=True))
1379
1380         # Build our opener
1381         opener = compat_urllib_request.OpenerDirector()
1382         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1383                         HTTPMethodFallback, HEADRedirectHandler,
1384                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1385             opener.add_handler(handler())
1386
1387         response = opener.open(HeadRequest(url))
1388         new_url = response.geturl()
1389
1390         if url == new_url:
1391             return False
1392
1393         self.report_following_redirect(new_url)
1394         return new_url
1395
1396     def _real_extract(self, url):
1397         new_url = self._test_redirect(url)
1398         if new_url: return [self.url_result(new_url)]
1399
1400         video_id = url.split('/')[-1]
1401         try:
1402             webpage = self._download_webpage(url, video_id)
1403         except ValueError as err:
1404             # since this is the last-resort InfoExtractor, if
1405             # this error is thrown, it'll be thrown here
1406             self._downloader.report_error(u'Invalid URL: %s' % url)
1407             return
1408
1409         self.report_extraction(video_id)
1410         # Start with something easy: JW Player in SWFObject
1411         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1412         if mobj is None:
1413             # Broaden the search a little bit
1414             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1415         if mobj is None:
1416             # Broaden the search a little bit: JWPlayer JS loader
1417             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1418         if mobj is None:
1419             self._downloader.report_error(u'Invalid URL: %s' % url)
1420             return
1421
1422         # It's possible that one of the regexes
1423         # matched, but returned an empty group:
1424         if mobj.group(1) is None:
1425             self._downloader.report_error(u'Invalid URL: %s' % url)
1426             return
1427
1428         video_url = compat_urllib_parse.unquote(mobj.group(1))
1429         video_id = os.path.basename(video_url)
1430
1431         # here's a fun little line of code for you:
1432         video_extension = os.path.splitext(video_id)[1][1:]
1433         video_id = os.path.splitext(video_id)[0]
1434
1435         # it's tempting to parse this further, but you would
1436         # have to take into account all the variations like
1437         #   Video Title - Site Name
1438         #   Site Name | Video Title
1439         #   Video Title - Tagline | Site Name
1440         # and so on and so forth; it's just not practical
1441         mobj = re.search(r'<title>(.*)</title>', webpage)
1442         if mobj is None:
1443             self._downloader.report_error(u'unable to extract title')
1444             return
1445         video_title = mobj.group(1)
1446
1447         # video uploader is domain name
1448         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1449         if mobj is None:
1450             self._downloader.report_error(u'unable to extract title')
1451             return
1452         video_uploader = mobj.group(1)
1453
1454         return [{
1455             'id':       video_id,
1456             'url':      video_url,
1457             'uploader': video_uploader,
1458             'upload_date':  None,
1459             'title':    video_title,
1460             'ext':      video_extension,
1461         }]
1462
1463
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse an "ytsearch[N|all]:terms" query and return its results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the results were computed but never returned, so
            # "ytsearchall:..." silently produced no videos.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                # CONSISTENCY FIX: every other failure path in this file uses
                # report_error; trouble() is the deprecated spelling.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total items than requested; shrink
            # the loop limit so we stop paging as soon as they run out.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1537
1538
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Log that a search-result page is about to be fetched."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Interpret a "gvsearch[N|all]:terms" query and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the 'gvsearch' marker
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, count))
                count = self._max_google_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull every docid out of this page, skipping duplicates.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                docid = match.group(1)
                if docid in collected:
                    continue
                collected.append(docid)
                if len(collected) == n:
                    # Requested count reached: queue everything and stop.
                    for docid in collected:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % docid])
                    return

            # No "next page" link: queue what we have and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for docid in collected:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % docid])
                return

            pagenum = pagenum + 1
1620
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Log that a search-result page is about to be fetched."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Interpret a "yvsearch[N|all]:terms" query and queue the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # drop the 'yvsearch' marker
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            count = int(prefix)
            if count <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
                return
            if count > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
                count = self._max_yahoo_results
            self._download_n_results(query, count)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, keeping first-seen order.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid in seen:
                    continue
                seen.add(vid)
                collected.append(vid)
                if len(collected) == n:
                    # Requested count reached: queue everything and stop.
                    for vid in collected:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                    return

            # No further result pages: queue what was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in collected:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
1706
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE must be passed here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Fetch every entry of a playlist through the paged GData API."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUG FIX: initialize the title up front. An empty playlist breaks
        # out of the loop before playlist_title is ever assigned, which
        # previously raised NameError at the return statement below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so entries can be sorted afterwards.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1790
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Log that a channel page is about to be fetched."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the video ids found in *page*, de-duplicated in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Gather every video id of a channel by walking its paged listing."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # The first page is served as plain HTML.
        self.report_download_page(channel_id, pagenum)
        page_url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(page_url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids.extend(self.extract_videos_from_page(page))

        # Any further pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(ajax_url)
                try:
                    raw = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(raw)

                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        entries = [self.url_result(u, 'Youtube') for u in watch_urls]
        return [self.playlist_result(entries, channel_id)]
1864
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect every upload of a user through the paged GData feed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # keep requesting consecutive windows until a short page arrives.
        all_ids = []
        page_idx = 0

        while True:
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # De-duplicate ids within this page while keeping their order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            all_ids.extend(page_ids)

            # A page that is not completely full must be the final one,
            # so there is no need to issue another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in all_ids]
        results = [self.url_result(u, 'Youtube') for u in watch_urls]
        return [self.playlist_result(results, playlist_title = username)]
1936
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Enumerate a blip.tv user's episodes via the mobile Ajax listing."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the Ajax API is embedded in
            # the user's profile page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return


        # Page through the Ajax listing (currently 12 items per page)
        # until a short page signals the end of the user's uploads.
        collected = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            page_url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # De-duplicate ids within this page, keeping first-seen order.
            page_ids = []
            for match in re.finditer(r'href="/([^"]+)"', page):
                if match.group(1) not in page_ids:
                    page_ids.append(unescapeHTML(match.group(1)))

            collected.extend(page_ids)

            # A page that is not completely full must be the final one,
            # so there is no need to issue another query.
            if len(page_ids) < self._PAGE_SIZE:
                break

            pagenum += 1

        episode_urls = [u'http://blip.tv/%s' % vid for vid in collected]
        entries = [self.url_result(u, 'BlipTV') for u in episode_urls]
        return [self.playlist_result(entries, playlist_title = username)]
2016
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles link to the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Fetch the page as if the 'Free download' button had been pressed.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # No download form found; surface the site's own explanation
            # for the failure when one is present on the page.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2072
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry in ~/.netrc. Without credentials this
        is a no-op and extraction proceeds anonymously. Login failures are
        reported as warnings, not errors, so extraction still continues.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response page means authentication failed.
            # BUG FIX: the warning message previously misspelled "exceeded".
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page.

        Returns a single-element list with the info dict; prefers the HD
        source and falls back to the SD one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array between these
        # two JavaScript fragments on the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON; the first video_data entry holds the
        # actual stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2170
2171
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles three URL shapes:
      * /play/... player URLs, which are resolved via their redirect target
        and re-dispatched as an /a/a-<id> URL;
      * URLs that respond directly with a video Content-Type (direct
        download);
      * regular post URLs, whose metadata is fetched through the site's
        JSON skin (skin=json).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return a single-element list with the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Player URLs redirect to a page whose fragment carries the real
        # file reference; rewrite to the canonical /a/a-<id> form and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request the JSON representation of the post via the "skin" API.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the client; pretend to
        # be iTunes (the same UA is passed through in the info dict below).
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode does not exist on Python 3, so this
                # branch can only work under Python 2 — confirm before porting.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # response body is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; reaching here means
                # the open succeeded but the response was not a direct video.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' key.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp example: "08-15-12 03:27PM" -> "20120815"
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2268
2269
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a single-element list with the info dict, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was "self._download.report_error" — the attribute is
            # "_downloader"; the old code raised AttributeError instead of
            # reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the media server base URL; the FLV
        # lives next to the thumbnails directory.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2314
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Resolution flow: the show page embeds an mgid: URI, which is resolved
    through the MRSS index feed into one <item> per video part; each part's
    config XML lists RTMP renditions per bitrate, whose URLs are rewritten
    to plain-HTTP mp4 URLs on the llnwd.net mirror.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, in ascending order; the last entry is the default pick.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution (informational, for --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-part media config XML."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS index feed."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format codes for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return a list of info dicts, one per part of the episode/clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcuts like ":tds" expand to the show's full-episodes page,
        # which then redirects to the newest episode (handled below).
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode in the URL means "download the newest".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The full-episodes page redirects to the latest episode; take
            # the episode title from the redirected URL.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Look for the player's mgid: URI in the flash embed parameters.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "mgid:cms:video:<show>.com:<id>"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp_url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The RTMP URLs do not actually work; rewrite the media path
            # onto the plain-HTTP llnwd.net mirror instead.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2506
2507
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Scrape an Escapist episode page and return its info dict.

        Reads the og: meta tags for description/thumbnail/player, then
        fetches the player configuration (JavaScript posing as JSON) to
        obtain the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            page_bytes = page_handle.read()
            # Honor the charset declared in the Content-Type header,
            # defaulting to UTF-8.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?',
                                 page_handle.headers['Content-Type'])
            page = page_bytes.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the (quoted) config URL in its query string.
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?',
                                 config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2578
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Fetches the moogaloop metadata XML for the video, then the Adobe HDS
    (f4m) manifest it points to, and finally builds the URL of the first
    media fragment ('Seg1-Frag1').
    """

    # Extractor is known to be broken; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a single-element list with the info dict for *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore parameter is required by the HDS server.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The manifest uses the Adobe f4m XML namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the first-fragment URL on the same host as the manifest.
        # NOTE(review): video_id[:-2] presumably strips a 2-char suffix from
        # the manifest id — confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2645
2646
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape an xvideos.com page for its FLV URL, title and thumbnail."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is URL-encoded in the page's flv_url parameter.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        # The whole matched thumbnail URL is used (group 0).
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2700
2701
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track permalink is resolved into the track's JSON metadata via
       the public resolve API; the stream descriptors for the track id are
       then fetched and the 128kbps MP3 stream URL is returned.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a single-element list with the info dict for a track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the slug of the track title are in the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical permalink into the track's JSON metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            info_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(resolv_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream descriptors for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            stream_json = compat_urllib_request.urlopen(
                compat_urllib_request.Request(streams_url)).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2770
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): same IE_NAME as the single-track SoundcloudIE; looks
    # unintentional but renaming would change extractor selection — confirm.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL via the Soundcloud API and return one info
        dict per track in the set.

        Uses report_error (like the rest of this file) instead of the
        deprecated FileDownloader.trouble.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # resolve.json maps the human-readable URL onto the API object
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The API reports problems (e.g. unknown set) in an 'errors' list
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Each track's stream URLs must be fetched individually
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2847
2848
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL, title and description for an
        InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real stream path is base64-encoded and
        # URL-quoted inside the jsclassref attribute
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional — keep a placeholder if absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # rsplit on the last dot: a plain split('.') raised ValueError
        # whenever the file name contained more than one dot.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2898
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format carries per-bitrate sub-dicts, pick the requested
        (or highest) bitrate; otherwise return the format's url list as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # dead link — try the next candidate
                pass

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON and return the first working file URL
        for the requested (or best) format.

        Note: the original code called .decode('utf-8') on values that are
        already str (regex groups, JSON strings), which raises
        AttributeError on Python 3; those calls have been removed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen returns bytes; decode explicitly so
        # json.loads works on every supported Python version)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each format until one of its URLs responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3009
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three kinds of URL, distinguished by the named groups of
    _VALID_URL:
      * course + video  -> a single lecture (metadata from an XML file)
      * course only     -> a course page; recurses into every VideoPage link
      * neither         -> the site root; recurses into every CoursePage link
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading webpage' % objid)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-lecture metadata lives in an XML file next to the videos
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # IndexError from findall(...)[0] means a malformed XML file
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique VideoPage links and recurse into each one
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # NOTE(review): read() yields bytes here (no decode), unlike
                # the course branch which uses _download_webpage — the
                # re.findall below presumably relies on Python 2 str; verify
                # on Python 3.
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect unique CoursePage links and recurse into each one
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3117
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single MTV video.

        Reads song name, performer and the mtvn URI from <meta> tags on
        the page, then fetches the mediaGen XML and picks the last (i.e.
        highest quality) rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # _download_webpage already returns a decoded unicode string; the
        # original .decode('iso-8859-1') was redundant on Python 2 and an
        # AttributeError on Python 3.
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error, consistent with the rest of this extractor
            # (trouble() is the deprecated spelling)
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3193
3194
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are split into numbered segments; one info dict is returned
    per segment. The segment file ids are obfuscated and must be
    descrambled with a seed taken from the playlist JSON.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers
        # (mimics what the site's own player generates).
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Build a seed-dependent permutation of the character set using a
        # linear-congruential shuffle. Statement order is significant:
        # each draw removes the picked char, shifting later indices.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Descramble the '*'-separated index list into the real file id by
        # looking each index up in the seeded permutation.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request onto one of the stream
            # format names the playlist advertises.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one key per segment; each is needed to build its download URL
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3300
3301
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url):
        """Fetch the page and pull the flv url, title and thumbnail out of
        it with the class-level regexes."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        self.report_webpage(video_id)

        # Download the page body; everything we need is embedded in it.
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        # The flv url is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3360
3361
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Two steps: the post page yields date/uploader/title and the photo
        page URL; the photo page lists the video links at all resolutions,
        of which the highest is returned.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # bail out: without the match, mobj.group(1) below would crash
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # bail out: links[-1] below would raise IndexError on an empty list
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3485
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape title,
        date and description from the page's markup."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The video itself is served from a fixed CDN path derived from the id
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'upload_date' is the field name the downloader consumes;
            # the original 'uploader_date' key was a typo and was ignored.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3521
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item count, list of info dicts)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: previously returned None, which made the caller's
            # tuple unpacking raise a TypeError; return an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like 'YYYY-MM-DD...'; squash to YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel page: archives come in pages of _JUSTIN_PAGE_LIMIT items
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3604
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bug fix: previously fell through and crashed on m.group(None)
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> when the player heading is missing
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # report_error replaces the deprecated trouble(); previously the
                # code continued and crashed on m.group() below.
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3643
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailers (with age-gate bypass)."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Return True if this extractor can handle the given URL.

        Overridden because _VALID_URL uses re.VERBOSE, which the default
        implementation does not pass to re.match.
        """
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = mobj.group('gameID')
        # Request the age-check variant of the page with a fixed birth date
        # so age-restricted trailers are served without user interaction.
        age_check_url = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(age_check_url, game_id)

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_matches = re.finditer(movie_re, webpage)
        title_matches = re.finditer(title_re, webpage)
        thumb_matches = re.finditer(thumb_re, webpage)

        results = []
        # The three patterns appear once per trailer, so iterate them in lockstep
        for movie_m, title_m, thumb_m in zip(movie_matches, title_matches, thumb_matches):
            clip_id = movie_m.group('videoID')
            clip_url = movie_m.group('videoURL')
            if not clip_url:
                self._downloader.report_error(u'Cannot find video url for %s' % clip_id)
            results.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return results
3685
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded streams follow a fixed CDN URL scheme
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_m = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader_m.group('uploader'),
        }]
3707
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        # Use the shared download helper (consistent with the other
        # extractors) instead of a raw urlopen with no error handling.
        webpage_src = self._download_webpage(url, video_id)

        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # report_error replaces the deprecated trouble() and adds the
            # 'ERROR: ' prefix itself.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""
        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3763
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows (metadata from inline JSON)."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Extension is whatever follows the last '.' in the URL path
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3798
3799
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Pretend the age check was already passed
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The fifth path component encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this used to test the stale 'result' variable from the
            # regex searches above instead of the looked-up format, so a
            # missing format returned [None] instead of reporting an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3916
3917
3918
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is taken straight from the URL path
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the message wrongly said 'video title' (copy-paste)
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3960
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Title comes from the page <title> element
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # The actual stream URL lives on a separate embed page
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed page passes the file URL to the flash player
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_m.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4006
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS object literal
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_m.group(1))

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        # First request starts the playback session; later requests advance it
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4050
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN scheme based on the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_m.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_m.group('uploader')),
        }]
4074
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    A playlist URL is expanded into url_results pointing back at this
    extractor; a talk URL is resolved to a direct download.ted.com MP4.
    """
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE.

        Overridden because _VALID_URL uses re.VERBOSE, which the default
        implementation does not pass to re.match.
        """
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # The two patterns appear once per talk, so they are zipped below
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            # NOTE(review): video_id is extracted but unused; only the talk
            # URL is needed because each entry is delegated back to TEDIE.
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4152
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata via an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this referenced the undefined name 'ext' (NameError);
            # fall back to the file extension computed above instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4208
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (media list via XML)."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_m.group(1))

        # Per-video XML document describing the available media files
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Pick the last media variant listed in the document
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4241
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # report_error replaces the deprecated trouble() and adds the
            # 'ERROR: ' prefix itself.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Bug fix: previously fell through and crashed on m.group() below
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4290
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract the best default-type stream from a Mediathek page.

        Returns a one-element list of info dicts, or None after reporting
        an error.
        """
        # Determine video id: prefer the numeric documentId query parameter
        # when present, otherwise fall back to the path component.
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # Determine title and media streams from the webpage.
        html = self._download_webpage(url, video_id)
        title_m = re.search(self._TITLE, html)
        if title_m is None:
            # BUG FIX: the original chained .group() onto re.search() and
            # crashed with AttributeError when the title was missing.
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_m.group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # BUG FIX: was `assert '"fsk"' in html` — an assert is stripped
            # under -O and raises a bare AssertionError on unexpected pages.
            if '"fsk"' in html:
                self._downloader.report_error(u'this video is only available after 8:00 pm')
            else:
                self._downloader.report_error(u'unable to extract media streams')
            return

        # Choose default media type (0) and the highest quality for now.
        default_streams = [s for s in streams if int(s['media_type']) == 0]
        if not default_streams:
            # BUG FIX: max() over an empty sequence raised ValueError.
            self._downloader.report_error(u'no default media stream found')
            return
        stream = max(default_streams, key=lambda s: int(s['quality']))

        # There are two possibilities: RTMP stream or plain HTTP download.
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            # rtmpdump expects a play path of the form "mp4:...".
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'unexpected RTMP play path: %s' % stream['video_url'])
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            if not stream['video_url'].endswith('.mp4'):
                raise ExtractorError(u'unexpected video URL: %s' % stream['video_url'])
            info['url'] = stream['video_url']
        return [info]
4330
4331
def gen_extractors():
    """Build and return one instance of every supported extractor.

    Order matters: the first extractor whose suitable() accepts a URL is
    the one that handles it, so more specific extractors come first and
    GenericIE is the catch-all at the end.
    """
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
4387
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name.

    ``ie_name`` is the class name without the trailing 'IE', e.g.
    ``'Youtube'`` resolves to ``YoutubeIE``. Raises KeyError if no such
    extractor is defined in this module.
    """
    return globals()['%sIE' % ie_name]