added redtube.com in InfoExtractors (2nd pull request with the required amendments)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready guards one-time initialization (see initialize());
    # _downloader is the FileDownloader this IE reports/downloads through;
    # _WORKING is False for known-broken extractors (skipped by tests).
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Subclasses usually only need to define _VALID_URL; override this
        # method for regex flags or cross-IE disambiguation.
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE"
        # (e.g. FooIE -> "Foo"). Subclasses may shadow this with a
        # plain class attribute (e.g. IE_NAME = u'youtube').
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default progress line; note=False -> silent.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported in this module directly;
            # this relies on it being re-exported by `from .utils import *`
            # — confirm utils keeps exporting it.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honor the charset from the Content-Type header; default to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary/odd-encoding pages survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared charsets.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode regex (matched with re.VERBOSE in suitable()/_extract_id):
    # accepts watch/embed/shortener/naked-ID forms; group 2 is the video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; unlisted itags fall back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimension string, used only for --list-formats display.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} for the video's subtitles, or an
        (error_message, None) tuple on failure (callers type-check this)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        # Display-only helper for --list-subs; returns nothing useful.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then the
        # first language the site reports.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        # Like _extract_subtitle, but fetches every available language.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        # Used by --list-formats; one line per itag with extension/dimensions.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set site language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from url (group 2 of _VALID_URL), or report an
        error and return None."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Download webpage + get_video_info and return a list of info dicts,
        one per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one response
        # contains a 'token' field.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators/whitespace before parsing the date string.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # --list-subs: display only, no extraction result.
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a urlencoded query string
            # with at least 'itag', 'url' and 'sig' fields.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dict per selected (itag, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group(1) is the video id (possibly prefixed "yt-" for YouTube embeds),
    # group(2) is the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST an age
        confirmation so that later video pages are served unfiltered."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element result list for a metacafe.com watch URL,
        delegating to the Youtube extractor for "yt-" prefixed ids."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # Older pages expose the media URL directly as a query parameter.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the "flashvars" value for a JSON-ish mediaData entry.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # Unescape JSON-style slashes and append the access key.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): .decode('utf-8') assumes the page is a Python 2 byte
        # string; _download_webpage may already return unicode -- confirm.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
781
class RedtubeIE(InfoExtractor):
    """Information Extractor for redtube.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
    IE_NAME = u'redtube'

    def _real_extract(self, url):
        """Return a one-element result list with the direct MP4 URL and
        title for a redtube video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Single raw-string pattern instead of concatenated fragments.
        # The non-greedy group avoids over-matching when a later
        # '" type="video/mp4">' occurs on the same line.
        mobj = re.search(r'<source src="(.+?)" type="video/mp4">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1)

        # Non-greedy so the title stops at the first closing </h1>.
        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage)
        if mobj is not None:
            video_title = mobj.group(1)
        else:
            # Fall back to a generated title so the download can still proceed.
            video_title = 'Redtube - %s' % time.ctime()

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
814
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element result list for a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Strip the "_slug" suffix and any query string from the path segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality key, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            # for/else: no known quality key was present in flashvars.
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Unescape JSON-style slashes in the media URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # Fall back to looking for an official user.
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Upload date is shown as DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
894
895
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # group(1) is the ".flv" filename passed in the "current" query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element result list for a photobucket .flv URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader both come from the page <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): .decode('utf-8') assumes Python 2 byte strings from
        # urlopen().read() -- confirm when porting.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
948
949
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Return a one-element result list for a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted (new_video=False on the recursive
        call marks that rewrite pass).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Bug fix: group(1) captures the 'people|profile' path segment; the
        # uploader name is in the second capture group.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1080
1081
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Return a one-element result list for a vimeo.com video URL,
        picking the best available quality/codec combination."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
            # are no longer swallowed here.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date (ISO date in the dateCreated meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd, then sd, then whatever else is available.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1186
1187
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in "index-<n>.html".
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return its content; reports an error and
        returns None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex (compiled with regexFlags) and
        return a dict built from matchTuples, a list of
        (group_index, key, error_message) triples.

        Reports an error and returns None when the regex does not match or
        a required group is empty.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP path/player/url for a live-stream page.

        NOTE(review): video_url is computed but never returned, and
        _real_extract discards this call's result, so the live path
        currently yields no downloadable result -- confirm whether this
        code path was ever finished.
        """
        # Language code ('fr'/'de') is the 4th-from-last path segment on live URLs.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of intermediate XML documents for an Arte+7
        page and return the info dict for the HD stream."""
        # Language code is the 3rd-from-last path segment on +7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and Arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # See NOTE in extractLiveStream: this path returns nothing.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1311
1312
1313 class GenericIE(InfoExtractor):
1314     """Generic last-resort information extractor."""
1315
1316     _VALID_URL = r'.*'
1317     IE_NAME = u'generic'
1318
1319     def report_download_webpage(self, video_id):
1320         """Report webpage download."""
1321         if not self._downloader.params.get('test', False):
1322             self._downloader.report_warning(u'Falling back on generic information extractor.')
1323         super(GenericIE, self).report_download_webpage(video_id)
1324
1325     def report_following_redirect(self, new_url):
1326         """Report information extraction."""
1327         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1328
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects manually; returns False when the final URL equals the
        input, the new URL otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force urllib to issue HEAD instead of the default GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1382
1383     def _real_extract(self, url):
1384         new_url = self._test_redirect(url)
1385         if new_url: return [self.url_result(new_url)]
1386
1387         video_id = url.split('/')[-1]
1388         try:
1389             webpage = self._download_webpage(url, video_id)
1390         except ValueError as err:
1391             # since this is the last-resort InfoExtractor, if
1392             # this error is thrown, it'll be thrown here
1393             self._downloader.report_error(u'Invalid URL: %s' % url)
1394             return
1395
1396         self.report_extraction(video_id)
1397         # Start with something easy: JW Player in SWFObject
1398         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1399         if mobj is None:
1400             # Broaden the search a little bit
1401             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1402         if mobj is None:
1403             # Broaden the search a little bit: JWPlayer JS loader
1404             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1405         if mobj is None:
1406             self._downloader.report_error(u'Invalid URL: %s' % url)
1407             return
1408
1409         # It's possible that one of the regexes
1410         # matched, but returned an empty group:
1411         if mobj.group(1) is None:
1412             self._downloader.report_error(u'Invalid URL: %s' % url)
1413             return
1414
1415         video_url = compat_urllib_parse.unquote(mobj.group(1))
1416         video_id = os.path.basename(video_url)
1417
1418         # here's a fun little line of code for you:
1419         video_extension = os.path.splitext(video_id)[1][1:]
1420         video_id = os.path.splitext(video_id)[0]
1421
1422         # it's tempting to parse this further, but you would
1423         # have to take into account all the variations like
1424         #   Video Title - Site Name
1425         #   Site Name | Video Title
1426         #   Video Title - Tagline | Site Name
1427         # and so on and so forth; it's just not practical
1428         mobj = re.search(r'<title>(.*)</title>', webpage)
1429         if mobj is None:
1430             self._downloader.report_error(u'unable to extract title')
1431             return
1432         video_title = mobj.group(1)
1433
1434         # video uploader is domain name
1435         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1436         if mobj is None:
1437             self._downloader.report_error(u'unable to extract title')
1438             return
1439         video_uploader = mobj.group(1)
1440
1441         return [{
1442             'id':       video_id,
1443             'url':      video_url,
1444             'uploader': video_uploader,
1445             'upload_date':  None,
1446             'title':    video_title,
1447             'ext':      video_extension,
1448         }]
1449
1450
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result list was computed but discarded; it must
            # be returned, as in every other branch (and as the sibling
            # Google/Yahoo search extractors do for their 'all' prefix).
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the real total; shrink the limit so paging
            # stops once every available result has been collected.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1521
1522
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Matches 'gvsearch:<q>', 'gvsearch<N>:<q>' and 'gvsearchall:<q>'.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Captures the docid of each result link on a search page.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present in the HTML whenever a "next page" link exists.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Validate the gvsearch query and dispatch on its numeric/'all' prefix."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            # No count given: download the single best match.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue whatever was collected and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1600
1601
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        decoded = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (decoded, pagenum))

    def _real_extract(self, query):
        """Validate the yvsearch query and dispatch on its numeric/'all' prefix."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            # No count given: fetch the single best match.
            return self._download_n_results(query, 1)
        if prefix == 'all':
            return self._download_n_results(query, self._max_yahoo_results)

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._download_n_results(query, 1)

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        return self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id in seen:
                    continue
                collected.append(video_id)
                seen.add(video_id)
                if len(collected) == n:
                    # Specified n videos reached: hand them to the downloader.
                    for video_id in collected:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # Last results page: queue whatever was found and stop.
                for video_id in collected:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum = pagenum + 1
1683
1684
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        """Resolve a playlist URL into an ordered playlist of video results."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            videos.extend((entry['yt$position']['$t'], entry['content']['src'])
                          for entry in feed['entry']
                          if 'content' in entry)

            if len(feed['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Order by playlist position, then keep only the video URLs.
        videos = [position_and_url[1] for position_and_url in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1753
1754
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page*, in order of appearance."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect all videos of a channel and wrap them into a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw = self._download_webpage(ajax_url, channel_id,
                                             u'Downloading page #%s' % pagenum)
                data = json.loads(raw)

                video_ids.extend(self.extract_videos_from_page(data['content_html']))

                if self._MORE_PAGES_INDICATOR not in data['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1813
1814
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Fetch all upload ids of a user via the GData API, page by page."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per request (currently 50), so keep
        # requesting consecutive pages until one comes back short - that
        # means every id has been seen.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one; stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1872
1873
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """List every episode of a blip.tv user as a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # BUG FIX: mobj.group(1) was previously called unconditionally and
        # crashed with AttributeError when the page had no data-users-id
        # attribute; report a proper error instead.
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1933
1934
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles link into the real file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: the pattern was a plain string '\s+'; '\s' is an
                # invalid escape sequence in a non-raw string (deprecated in
                # Python 3.6+). Use a raw string literal.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1985
1986
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from the options or .netrc, if provided."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available: continue anonymously.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form reappears in the response only when login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUG FIX: the user-facing warning misspelled 'exceeded'.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The URL-encoded player parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2084
2085
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info, either from a direct media response or from
        the site's JSON API (skin=json)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment carries
        # the real file reference; resolve it and restart extraction on the
        # canonical a/a-<id> URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON representation of the page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # BUGFIX: on Python 3 the URL is already text and str has no
                # .decode(), so the old unconditional decode raised
                # AttributeError. Only decode the Python 2 bytes case.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' is parsed against the site's mm-dd-yy hh:mm
                # format and normalized to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2182
2183
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: this previously called self._download.report_error,
            # which raised AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the movie directory; the flv file sits
        # next to it as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2225
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Format id -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Format id -> frame dimensions (for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here because _VALID_URL is written with re.VERBOSE
        # (presumably the inherited check does not pass that flag).
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of the episode/clip.

        Flow: resolve shortcuts/redirects to an episode URL, scrape the
        mtvnservices media URI from the page, download the MRSS index for
        the episode, then fetch a mediaGen config per part to pick a
        rendition and rewrite its rtmp URL onto the HTTP mirror.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # :tds / :colbert shortcuts mean "newest full episode".
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        if dlNewest:
            # BUGFIX: the old code referenced an undefined name htmlHandle
            # here (NameError). _download_webpage does not expose the
            # response object, so follow the redirect explicitly to learn
            # the URL of the newest full episode.
            try:
                htmlHandle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage = self._download_webpage(url, epTitle)

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like ...:<show>.com:<mediaId>; split out the parts.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, rtmp url) pairs for every rendition.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The rtmp URLs are not downloaded directly; rewrite the
            # gsp.comedystor path onto the known HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2396
2397
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from the page's meta tags and the player
        configuration JSON that the og:video tag points to."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # ROBUSTNESS: each of these re.search calls used to be followed by
        # an unchecked .group(1), so a missing tag crashed with
        # AttributeError on None. Raise a descriptive error instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to find video description')
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to find video thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to find player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to find configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2453
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the extractor is fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract an f4f fragment URL via the moogaloop metadata XML and
        the Adobe f4m manifest it references."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Info dict is filled in incrementally below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # Title/description/thumbnail and the manifest URL all come from
        # the metadata document's <video> node.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore query parameter is appended before fetching the manifest
        # (presumably required by the HDS server — unverified).
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest uses the Adobe namespace; media/@url and <id>
        # provide the pieces of the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Assemble the URL of the first segment/fragment.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2520
2521
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL sits URL-encoded inside the flashvars.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # The title is taken from the <title> tag, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        # For the thumbnail the whole matched URL is used.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2575
2576
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The permalink is resolved through the public API to obtain the track
    metadata, then the stream-definitions endpoint is queried for the
    actual mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader name and the track slug are part of the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2634
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    The set permalink is resolved through the public API, after which the
    stream definitions are fetched for every track it contains.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Announce that the permalink is being resolved to track ids."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader name and the set slug are part of the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2698
2699
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 rtmpe reference embedded in the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media reference is stored base64-encoded in a JS variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # The title lives in a JS assignment rather than the <title> tag.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # The description meta tag is optional; keep a placeholder if absent.
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the referenced file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2749
2750 class MixcloudIE(InfoExtractor):
2751     """Information extractor for www.mixcloud.com"""
2752
2753     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2754     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2755     IE_NAME = u'mixcloud'
2756
    def report_download_json(self, file_id):
        """Report JSON download.

        NOTE(review): file_id is accepted for symmetry with the other
        report_* helpers but is not included in the message.
        """
        self.to_screen(u'Downloading json')
2760
2761     def get_urls(self, jsonData, fmt, bitrate='best'):
2762         """Get urls from 'audio_formats' section in json"""
2763         file_url = None
2764         try:
2765             bitrate_list = jsonData[fmt]
2766             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2767                 bitrate = max(bitrate_list) # select highest
2768
2769             url_list = jsonData[fmt][bitrate]
2770         except TypeError: # we have no bitrate info.
2771             url_list = jsonData[fmt]
2772         return url_list
2773
2774     def check_urls(self, url_list):
2775         """Returns 1st active url from list"""
2776         for url in url_list:
2777             try:
2778                 compat_urllib_request.urlopen(url)
2779                 return url
2780             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2781                 url = None
2782
2783         return None
2784
    def _print_formats(self, formats):
        # Pretty-print available format/bitrate/extension combinations.
        # formats maps a format name to either {bitrate: [urls]} or a plain
        # list of urls (no bitrate info).
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    # formats[fmt] is a list: print a single line and stop.
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break
2796
2797     def _real_extract(self, url):
2798         mobj = re.match(self._VALID_URL, url)
2799         if mobj is None:
2800             self._downloader.report_error(u'invalid URL: %s' % url)
2801             return
2802         # extract uploader & filename from url
2803         uploader = mobj.group(1).decode('utf-8')
2804         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2805
2806         # construct API request
2807         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2808         # retrieve .json file with links to files
2809         request = compat_urllib_request.Request(file_url)
2810         try:
2811             self.report_download_json(file_url)
2812             jsonData = compat_urllib_request.urlopen(request).read()
2813         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2814             self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2815             return
2816
2817         # parse JSON
2818         json_data = json.loads(jsonData)
2819         player_url = json_data['player_swf_url']
2820         formats = dict(json_data['audio_formats'])
2821
2822         req_format = self._downloader.params.get('format', None)
2823         bitrate = None
2824
2825         if self._downloader.params.get('listformats', None):
2826             self._print_formats(formats)
2827             return
2828
2829         if req_format is None or req_format == 'best':
2830             for format_param in formats.keys():
2831                 url_list = self.get_urls(formats, format_param)
2832                 # check urls
2833                 file_url = self.check_urls(url_list)
2834                 if file_url is not None:
2835                     break # got it!
2836         else:
2837             if req_format not in formats:
2838                 self._downloader.report_error(u'format is not available')
2839                 return
2840
2841             url_list = self.get_urls(formats, req_format)
2842             file_url = self.check_urls(url_list)
2843             format_param = req_format
2844
2845         return [{
2846             'id': file_id.decode('utf-8'),
2847             'url': file_url.decode('utf-8'),
2848             'uploader': uploader.decode('utf-8'),
2849             'upload_date': None,
2850             'title': json_data['name'],
2851             'ext': file_url.split('.')[-1].decode('utf-8'),
2852             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2853             'thumbnail': json_data['thumbnail_url'],
2854             'description': json_data['description'],
2855             'player_url': player_url.decode('utf-8'),
2856         }]
2857
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, the HomePage, a CoursePage or a VideoPage.
    # The optional named groups 'course' and 'video' decide which of the
    # three extraction branches below is taken.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site.

        A video URL yields a one-element info list.  Course and root
        pages are crawled for links, and every link is recursively fed
        back through self.extract(); the flattened results are returned.
        Returns None after reporting a download/parse error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # per-video metadata lives in an XML file next to the media
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # IndexError from [0] signals a malformed metadata file
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # course title; fall back to the course id when absent
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # collect every VideoPage link, de-duplicated in page order
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # recurse: each video page goes through this extractor again
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # collect every CoursePage link and recurse into each course
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2961
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract an MTV.com video.

        Scrapes the song/performer/uri meta tags from the video page,
        fetches the mediaGen XML, and picks the last (highest-quality)
        rendition.  Returns a one-element info list, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # webpage is already decoded text; the old .decode('iso-8859-1')
        # calls here crashed under Python 3
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message used to read 'unable to mtvn_uri'
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type like 'video/mp4' -> extension 'mp4'
            ext = rendition.attrib['type'].partition('/')[2]
            # renamed from 'format' to avoid shadowing the builtin
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
3037
3038
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 streams)."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms-timestamp + two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode file ids.

        Deterministically permutes a fixed character set with a linear
        congruential generator seeded by the server-provided `seed`, so
        the exact arithmetic here must match the site's player.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step; constants mirror the player's scrambling code
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id.

        Each component of `fileId` indexes into the shuffled alphabet
        produced from `seed`.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as separate info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # map the requested format onto the site's format names;
            # 'hd2' is preferred for 'best' when available
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # one key per segment, needed to authorize each download URL
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3133
3134
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape flv url, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # fetch the page once and run all three patterns against it
        webpage = self._download_webpage(url, video_id)

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        # the flv url is percent-encoded inside the player parameters
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3182
3183
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract a video from a Google+ post.

        Downloads the post page for metadata, then the photo/video page
        for the actual stream links, and returns the highest-resolution
        one.  Returns None after reporting an error.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # previously fell through and crashed on links[-1]
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3294
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # the media file lives at a predictable CDN path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # first-group scrape with a fallback default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # key was misspelled 'uploader_date', which silently dropped
            # the date from the output (see field list at top of file)
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3330
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and turn each clip into an info dict.

        Returns (number of clips in the response, list of info dicts);
        returns (0, []) after reporting an API error so the caller's
        tuple unpacking never sees None.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            # error responses come back as a dict with an 'error' key
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # previously returned None, which crashed the caller's
            # `page_count, page_info = ...` unpacking with a TypeError
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with YYYY-MM-DD; strip the dashes
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract either a whole channel archive (paged) or one broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # channel URL: page through the archive endpoint
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # a short page means we've reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3409
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the <video>/<source> tag, title and description."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # previously fell through and crashed on m.group('url')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # fall back to the page <title> when the player heading is absent
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                # previously fell through and crashed on m.group('title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3448
class SteamIE(InfoExtractor):
    """Information extractor for Steam store game trailers."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # InfoExtractor.suitable() match would not work here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every movie on a game's page as one playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # go through the age gate with a fixed birth date
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # the three finditer streams line up movie url, title and thumb
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                # skip this entry; previously it was still appended with
                # an empty url and failed later in the downloader
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3493
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Derive the CDN flv URL from the video id and scrape metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3515
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape direct video URL, title and thumbnail from the page."""
        _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""

        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        src_match = re.search(_src_url, webpage_src)
        if src_match is None:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return
        video_url = src_match.group()
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"""<title>(.*)</title>""", webpage_src)
        if title_match is not None:
            title = title_match.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r"""rel="image_src" href="(.*)" />""", webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3570
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON blob in an inline script.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the Akamai URL.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_ext = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]
        host = show.get('host', {})
        return [{
                'id': video_id,
                'url': stream_url,
                'ext': stream_ext,
                'title': show['title'],
                'description': show.get('teaser_text'),
                'location': show.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': show.get('image', {}).get('large_url_2x'),
                'duration': show.get('duration'),
        }]
3605
3606
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a simple cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path element encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are listed best-first on the page.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: previously this tested the stale regex match `result`
            # instead of the looked-up `format`, so an unknown --format
            # silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3723
3724
3725
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the id and the title are part of the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the failure message wrongly said "video title".
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3767
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page and take the title from its <title> tag.
        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_m.group('title').strip()

        # The actual stream is referenced from a separate embed page.
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player is handed the file URL via addVariable().
        source_m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_m.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3813
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JSON assignment.
        mix_m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_m:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mix_m.group(1))

        # The play API wants an arbitrary per-session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        # Walk the play/next API until the server flags the last track.
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3857
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        # Media and thumbnail URLs follow a fixed CDN naming scheme.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_m.group('title'))
        uploader_m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_m.group('uploader'))
        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }]
3881
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex: distinguishes /playlists/<id>/... from /talks/... URLs.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here so the match is done with re.VERBOSE, which
        # _VALID_URL requires.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the two finditer() scans below are assumed to yield
        # matches in the same order, so zip() pairs each talk id with its
        # title/URL — confirm against current page markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails JSON blob carries the numeric id and the media slug
        # used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3960
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: fall back to `extension`; the previous code referenced
            # the undefined name `ext` and raised NameError on this path.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4016
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_m.group(1))

        # Stream details are served from a separate per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry of the document is the variant we download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        return [{
            'id': video_id,
            'url': video_url,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4049
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Bug fix: missing return meant execution fell through and
            # m.group('title') raised AttributeError on a None match.
            return
        # Strip the site branding from the og:title value.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4098
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / DasErste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Matches mediaCollection.addMediaStream(type, quality, rtmp_url, video_url, ...)
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams found: presumably an age-restricted ("fsk") page —
            # such videos are only served in the evening.
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4138
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # (message grammar fixed: was "No video founded")
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as thumbnail; guard against a missing
        # match instead of crashing with AttributeError on .group().
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage, re.DOTALL)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4172
class BandcampIE(InfoExtractor):
    """Information extractor for bandcamp.com tracks (free downloads only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # (message grammar fixed: was "founded")
            self._downloader.report_error('No free songs found')
            return
        download_link = m_download.group(1)
        # Renamed from `id`, which shadowed the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4218
4219
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Instantiate in priority order: more specific extractors first,
    # GenericIE as the catch-all at the end.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        RedtubeIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4278
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Look up e.g. 'Youtube' -> YoutubeIE in this module's namespace;
    # an unknown name raises KeyError.
    return globals()['%sIE' % ie_name]