2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Takes `cls`, so presumably decorated with @classmethod in the full
        # source -- the decorator line is not visible here; confirm upstream.
        return re.match(cls._VALID_URL, url) is not None

        # NOTE(review): fragment -- the enclosing property/method header for
        # the _WORKING getter is not visible in this view.
        """Getter method for _WORKING."""

        # NOTE(review): fragment -- the enclosing `initialize` method header
        # and its "already initialized?" guard are not visible in this view.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): fragment of the IE_NAME property (header not visible).
        # Strips the trailing "IE" from the class name, e.g. YoutubeIE -> Youtube.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:` header is missing before the
        # next line in this view.
            self.report_download_webpage(video_id)
        elif note is not False:
            # Any other truthy note is shown verbatim, prefixed by the video id.
            self.to_screen(u'%s: %s' % (video_id, note))
        # NOTE(review): `try:` header missing here.
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): an `if errnote is None:` guard is presumably
            # missing before the default message -- confirm upstream.
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the `if m:` header and the fallback-encoding branch
        # are missing here.
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): `try:` header missing here.
            url = url_or_request.get_full_url()
            except AttributeError:
            # NOTE(review): the fallback assignment (plain string URL) under
            # AttributeError is missing here.
            self.to_screen(u'Dumping request to ' + url)
            # Base64 keeps binary pages printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids hard failures on pages with broken encodings.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): `return video_info` is not visible in this view.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the remaining dict entries and the return statement
        # are not visible in this view.

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the dict continuation, the `if` guards around the two
        # optional assignments below, and the return are not visible here.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    # NOTE(review): fragment of the verbose _VALID_URL pattern -- the
    # `_VALID_URL = r"""` opening line is not visible in this view.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map; most entries are missing from this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display map; the entries are missing from this view.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle track list; on failure return an
        (error_message, None) tuple instead of a dict (callers check with
        isinstance(..., tuple))."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): `try:` header missing here.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Build {lang_code: lang_name} from the XML attribute pairs.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): `return sub_lang_list` is not visible in this view.

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track and return a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # NOTE(review): the query parameters (lang/name/v/fmt) are missing
        # from this view.
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): `try:` header missing here.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): an `if not sub:` guard is presumably missing before
        # the empty-result return below -- confirm upstream.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a single tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language choice: explicit --sub-lang beats English beats first listed.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): the `sub_lang = 'en'` body and the `else:` header
            # are missing here.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): `return [subtitle]` is not visible in this view.

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the `subtitles = []` initialisation is not visible here.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): `return subtitles` is not visible in this view.

    def _print_formats(self, formats):
        """Print each format with its extension and dimensions (--list-formats)."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` header is missing here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Best-effort session setup: set language, log in (username/password
        or .netrc), then confirm age."""
        if self._downloader is None:
            # NOTE(review): the early `return` body is not visible here.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): `try:` header missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the `if info is not None:` unpacking branch is
            # missing before the raise below.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the English interface so later regexes match.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the `try:` header and the self.report_lang() call are
        # missing here.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the `if username is None: return` guard is missing here.

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): `try:` header missing here.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden anti-forgery tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the `if match:` guard is missing here.
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the dsh extraction body and the opening of the
        # login_form_strs dict (with Email/Passwd/GALX/dsh entries) are
        # missing here.
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values properly.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): the `try:` header and the self.report_login() call are
        # missing here.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # NOTE(review): the opening of the age_form dict is missing here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): `try:` header missing here.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the video id from url using _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
        # NOTE(review): `return video_id` is not visible in this view.

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): the `if mobj:` guard is missing here.
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header missing here.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the `if mobj is not None:` / `else:` lines are missing
        # here.
            # Un-escape the backslash-escaped URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Query get_video_info, trying several `el` values until one yields a
        # token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): the `break` body is not visible here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` header is missing here.
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (nickname/channel), optional.
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the `if mobj is not None:` / `else:` headers are
        # missing around the next two lines.
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail, optional.
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # NOTE(review): the fallback assignment is not visible here.
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, scraped from the watch page.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): the `upload_date = None` initialisation and the
        # `if mobj is not None:` guard are missing here.
            # Normalise separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the `else:` header is missing here; fall back to the
        # meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            # NOTE(review): the `if fd_mobj:` / `else:` headers are missing
            # around the next two lines.
            video_description = unescapeHTML(fd_mobj.group(1))
            video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): the guards around unpacking/reporting below are
            # missing here.
            (sub_error, sub_lang, sub) = video_subtitles[0]
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # NOTE(review): an `if sub_error:` guard is presumably missing
                # before the report below -- confirm upstream.
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # NOTE(review): the early `return` is not visible here.

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        # NOTE(review): the fallback assignment and the `else:` header are
        # missing around the next line.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams carry a single opaque URL and no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build an itag -> signed URL map from the stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Drop everything better than the requested cap.
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the `else:` header is missing here.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): the early `return` is not visible here.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the `else:` header is missing here.
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): the `if rf in url_map:` guard and the
                    # `break` are missing around the next line.
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        # NOTE(review): the `else:` header is missing here.
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Emit one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opening and the 'id' entry
            # are missing here.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
        # NOTE(review): the dict/list close and `return results` are not
        # visible in this view.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        confirm age (best effort)."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): `try:` header missing here.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # NOTE(review): the opening of the disclaimer_form dict is missing
        # here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): `try:` header missing here.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-XXXX" ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the `if mobj is not None:` header is missing here;
        # this branch handles the direct mediaURL case.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): the `if mobj is None:` branch (plain URL) is
            # missing before the two lines below.
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): the `else:` header (flashvars fallback) is missing
        # here.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # NOTE(review): the `if mobj is None:` guard is missing here.
                self._downloader.report_error(u'unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            # NOTE(review): the `if mobj is None:` guard is missing here.
                self._downloader.report_error(u'unable to extract media URL')
            # JSON-style escaped slashes -> plain slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opening is missing here.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # NOTE(review): the dict/list close is not visible in this view.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Strip the "_title-slug" suffix and any query string from the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos stay reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst and keep the first present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the `if key in flashvars:` guard, the max_quality
            # assignment and the `break` are missing around the next line.
                self.to_screen(u'Using %s' % key)
        # NOTE(review): the for/else (no quality found) header is missing
        # here.
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract video URL')

        # JSON-style escaped slashes -> plain slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): the `if mobj is None:` header is missing here.
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            # NOTE(review): the `else:` header is missing here.
                video_uploader = mobj_official.group(1)
        # NOTE(review): the outer `else:` header is missing here.
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): the `if mobj is not None:` guard is missing here.
            # Reassemble DD-MM-YYYY into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opening and the 'id'/'url' entries are
        # missing here.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        # NOTE(review): the dict/list close is not visible in this view.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header missing here.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # NOTE(review): video_url is referenced below but its assignment from
        # mediaURL is not visible in this view.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opening is missing here.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # NOTE(review): the dict/list close is not visible in this view.
917 class YahooIE(InfoExtractor):
918 """Information extractor for video.yahoo.com."""
921 # _VALID_URL matches all Yahoo! Video URLs
922 # _VPAGE_URL matches only the extractable '/watch/' URLs
923 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
924 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
925 IE_NAME = u'video.yahoo'
927 def _real_extract(self, url, new_video=True):
928 # Extract ID from URL
929 mobj = re.match(self._VALID_URL, url)
931 self._downloader.report_error(u'Invalid URL: %s' % url)
934 video_id = mobj.group(2)
935 video_extension = 'flv'
937 # Rewrite valid but non-extractable URLs as
938 # extractable English language /watch/ URLs
939 if re.match(self._VPAGE_URL, url) is None:
940 request = compat_urllib_request.Request(url)
942 webpage = compat_urllib_request.urlopen(request).read()
943 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
944 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
947 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
949 self._downloader.report_error(u'Unable to extract id field')
951 yahoo_id = mobj.group(1)
953 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
955 self._downloader.report_error(u'Unable to extract vid field')
957 yahoo_vid = mobj.group(1)
959 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
960 return self._real_extract(url, new_video=False)
962 # Retrieve video webpage to extract further information
963 request = compat_urllib_request.Request(url)
965 self.report_download_webpage(video_id)
966 webpage = compat_urllib_request.urlopen(request).read()
967 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
968 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
971 # Extract uploader and title from webpage
972 self.report_extraction(video_id)
973 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
975 self._downloader.report_error(u'unable to extract video title')
977 video_title = mobj.group(1).decode('utf-8')
979 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
981 self._downloader.report_error(u'unable to extract video uploader')
983 video_uploader = mobj.group(1).decode('utf-8')
985 # Extract video thumbnail
986 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
988 self._downloader.report_error(u'unable to extract video thumbnail')
990 video_thumbnail = mobj.group(1).decode('utf-8')
992 # Extract video description
993 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
995 self._downloader.report_error(u'unable to extract video description')
997 video_description = mobj.group(1).decode('utf-8')
998 if not video_description:
999 video_description = 'No description available.'
1001 # Extract video height and width
1002 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1004 self._downloader.report_error(u'unable to extract video height')
1006 yv_video_height = mobj.group(1)
1008 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1010 self._downloader.report_error(u'unable to extract video width')
1012 yv_video_width = mobj.group(1)
1014 # Retrieve video playlist to extract media URL
1015 # I'm not completely sure what all these options are, but we
1016 # seem to need most of them, otherwise the server sends a 401.
1017 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1018 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1019 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1020 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1021 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1023 self.report_download_webpage(video_id)
1024 webpage = compat_urllib_request.urlopen(request).read()
1025 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1026 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1029 # Extract media URL from playlist XML
1030 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1032 self._downloader.report_error(u'Unable to extract media URL')
1034 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1035 video_url = unescapeHTML(video_url)
1038 'id': video_id.decode('utf-8'),
1040 'uploader': video_uploader,
1041 'upload_date': None,
1042 'title': video_title,
1043 'ext': video_extension.decode('utf-8'),
1044 'thumbnail': video_thumbnail.decode('utf-8'),
1045 'description': video_description,
1049 class VimeoIE(InfoExtractor):
# Extracts metadata and a direct media URL from vimeo.com watch pages by
# parsing the JSON "config" object embedded in the page HTML.
# NOTE(review): the numbering gaps in this listing indicate elided lines
# (e.g. "if mobj is None:" guards, try/except frames, the final return);
# confirm against the complete source before modifying.
1050 """Information extractor for vimeo.com."""
1052 # _VALID_URL matches Vimeo URLs
1053 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1056 def _real_extract(self, url, new_video=True):
1057 # Extract ID from URL
1058 mobj = re.match(self._VALID_URL, url)
1060 self._downloader.report_error(u'Invalid URL: %s' % url)
1063 video_id = mobj.group('id')
# Normalize the URL: force https when no scheme was given, and rewrite
# the play_redirect_hls form to the canonical watch-page URL.
1064 if not mobj.group('proto'):
1065 url = 'https://' + url
1066 if mobj.group('direct_link'):
1067 url = 'https://vimeo.com/' + video_id
1069 # Retrieve video webpage to extract further information
1070 request = compat_urllib_request.Request(url, None, std_headers)
1071 webpage = self._download_webpage(request, video_id)
1073 # Now we begin extracting as much information as we can from what we
1074 # retrieved. First we extract the information common to all extractors,
1075 # and latter we extract those that are Vimeo specific.
1076 self.report_extraction(video_id)
1078 # Extract the config JSON
# The config object is cut out of the page HTML by plain string splitting
# and then parsed as JSON; a split failure lands in the elided error path.
1080 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1081 config = json.loads(config)
1083 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1084 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1086 self._downloader.report_error(u'unable to extract info section')
1090 video_title = config["video"]["title"]
1092 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1093 video_uploader = config["video"]["owner"]["name"]
1094 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1096 # Extract video thumbnail
1097 video_thumbnail = config["video"]["thumbnail"]
1099 # Extract video description
1100 video_description = get_element_by_attribute("itemprop", "description", webpage)
1101 if video_description: video_description = clean_html(video_description)
1102 else: video_description = u''
1104 # Extract upload date
1105 video_upload_date = None
1106 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1107 if mobj is not None:
# Concatenated to the YYYYMMDD format the info dict documents for upload_date.
1108 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1110 # Vimeo specific: extract request signature and timestamp
1111 sig = config['request']['signature']
1112 timestamp = config['request']['timestamp']
1114 # Vimeo specific: extract video codec and quality information
1115 # First consider quality, then codecs, then take everything
1116 # TODO bind to format param
1117 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1118 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality: hd preferred, then sd, otherwise
# fall back to whatever quality key the config lists first.
1119 for codec_name, codec_extension in codecs:
1120 if codec_name in config["video"]["files"]:
1121 if 'hd' in config["video"]["files"][codec_name]:
1122 files['hd'].append((codec_name, codec_extension, 'hd'))
1123 elif 'sd' in config["video"]["files"][codec_name]:
1124 files['sd'].append((codec_name, codec_extension, 'sd'))
1126 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first candidate in quality preference order.
1128 for quality in ('hd', 'sd', 'other'):
1129 if len(files[quality]) > 0:
1130 video_quality = files[quality][0][2]
1131 video_codec = files[quality][0][0]
1132 video_extension = files[quality][0][1]
1133 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1136 self._downloader.report_error(u'no known codec found')
# Build the play_redirect URL Vimeo's player uses to hand out the media file.
1139 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1140 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1145 'uploader': video_uploader,
1146 'uploader_id': video_uploader_id,
1147 'upload_date': video_upload_date,
1148 'title': video_title,
1149 'ext': video_extension,
1150 'thumbnail': video_thumbnail,
1151 'description': video_description,
1155 class ArteTvIE(InfoExtractor):
# Extractor for videos.arte.tv (fr/de). Live pages and "+7" catch-up pages
# are dispatched to separate helpers from _real_extract.
# NOTE(review): numbering gaps indicate elided lines (try frames, guards,
# returns); confirm against the complete source before modifying.
1156 """arte.tv information extractor."""
1158 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1159 _LIVE_URL = r'index-[0-9]+\.html$'
1161 IE_NAME = u'arte.tv'
1163 def fetch_webpage(self, url):
# Download a page, routing network failures through the downloader's
# error reporting.
1164 request = compat_urllib_request.Request(url)
1166 self.report_download_webpage(url)
1167 webpage = compat_urllib_request.urlopen(request).read()
1168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1169 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1171 except ValueError as err:
1172 self._downloader.report_error(u'Invalid URL: %s' % url)
1176 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, run regex over it, and copy the requested capture groups into
# a dict; matchTuples entries are (group index, dict key, error message).
1177 page = self.fetch_webpage(url)
1178 mobj = re.search(regex, page, regexFlags)
1182 self._downloader.report_error(u'Invalid URL: %s' % url)
1185 for (i, key, err) in matchTuples:
1186 if mobj.group(i) is None:
1187 self._downloader.report_error(err)
1190 info[key] = mobj.group(i)
1194 def extractLiveStream(self, url):
# The language code is taken from a fixed path component of the URL.
1195 video_lang = url.split('/')[-4]
# Step 1: locate the videothek JS file referenced by the page.
1196 info = self.grep_webpage(
1198 r'src="(.*?/videothek_js.*?\.js)',
1201 (1, 'url', u'Invalid URL: %s' % url)
1204 http_host = url.split('/')[2]
1205 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
# Step 2: pull stream path, SWF player and base URL out of that JS.
1206 info = self.grep_webpage(
1208 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1209 '(http://.*?\.swf).*?' +
1213 (1, 'path', u'could not extract video path: %s' % url),
1214 (2, 'player', u'could not extract video player: %s' % url),
1215 (3, 'url', u'could not extract video url: %s' % url)
1218 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1220 def extractPlus7Stream(self, url):
1221 video_lang = url.split('/')[-3]
# Step 1: the flash param points at a videoref XML document.
1222 info = self.grep_webpage(
1224 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1227 (1, 'url', u'Invalid URL: %s' % url)
1230 next_url = compat_urllib_parse.unquote(info.get('url'))
# Step 2: pick the <video> entry for the requested language.
1231 info = self.grep_webpage(
1233 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1236 (1, 'url', u'Could not find <video> tag: %s' % url)
1239 next_url = compat_urllib_parse.unquote(info.get('url'))
# Step 3: the final XML carries id, title, date and the HD media URL.
1241 info = self.grep_webpage(
1243 r'<video id="(.*?)".*?>.*?' +
1244 '<name>(.*?)</name>.*?' +
1245 '<dateVideo>(.*?)</dateVideo>.*?' +
1246 '<url quality="hd">(.*?)</url>',
1249 (1, 'id', u'could not extract video id: %s' % url),
1250 (2, 'title', u'could not extract video title: %s' % url),
1251 (3, 'date', u'could not extract video date: %s' % url),
1252 (4, 'url', u'could not extract video url: %s' % url)
1257 'id': info.get('id'),
1258 'url': compat_urllib_parse.unquote(info.get('url')),
1259 'uploader': u'arte.tv',
1260 'upload_date': info.get('date'),
1261 'title': info.get('title').decode('utf-8'),
1267 def _real_extract(self, url):
1268 video_id = url.split('/')[-1]
1269 self.report_extraction(video_id)
# Live pages are recognized by their "index-N.html" basename.
1271 if re.search(self._LIVE_URL, video_id) is not None:
1272 self.extractLiveStream(url)
1275 info = self.extractPlus7Stream(url)
1280 class GenericIE(InfoExtractor):
# Last-resort extractor: first follows URL-shortener style redirects with
# HEAD requests, then scrapes common flash-player URL patterns out of an
# arbitrary page.
# NOTE(review): numbering gaps indicate elided lines (guards, returns,
# constructor keyword arguments); confirm against the complete source.
1281 """Generic last-resort information extractor."""
1284 IE_NAME = u'generic'
1286 def report_download_webpage(self, video_id):
1287 """Report webpage download."""
# Warn that the generic fallback is in use (suppressed under --test).
1288 if not self._downloader.params.get('test', False):
1289 self._downloader.report_warning(u'Falling back on generic information extractor.')
1290 super(GenericIE, self).report_download_webpage(video_id)
1292 def report_following_redirect(self, new_url):
1293 """Report information extraction."""
1294 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1296 def _test_redirect(self, url):
1297 """Check if it is a redirect, like url shorteners, in case return the new url."""
# HEAD variant of Request so the redirect probe avoids downloading bodies.
1298 class HeadRequest(compat_urllib_request.Request):
1299 def get_method(self):
1302 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1304 Subclass the HTTPRedirectHandler to make it use our
1305 HeadRequest also on the redirected URL
1307 def redirect_request(self, req, fp, code, msg, headers, newurl):
1308 if code in (301, 302, 303, 307):
# Escape spaces and drop entity headers before re-issuing as HEAD.
1309 newurl = newurl.replace(' ', '%20')
1310 newheaders = dict((k,v) for k,v in req.headers.items()
1311 if k.lower() not in ("content-length", "content-type"))
1312 return HeadRequest(newurl,
1314 origin_req_host=req.get_origin_req_host(),
1317 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1319 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1321 Fallback to GET if HEAD is not allowed (405 HTTP error)
1323 def http_error_405(self, req, fp, code, msg, headers):
1327 newheaders = dict((k,v) for k,v in req.headers.items()
1328 if k.lower() not in ("content-length", "content-type"))
1329 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1331 origin_req_host=req.get_origin_req_host(),
# Build a bare opener containing only the handlers the HEAD probe needs.
1335 opener = compat_urllib_request.OpenerDirector()
1336 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1337 HTTPMethodFallback, HEADRedirectHandler,
1338 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1339 opener.add_handler(handler())
1341 response = opener.open(HeadRequest(url))
1342 new_url = response.geturl()
1347 self.report_following_redirect(new_url)
1350 def _real_extract(self, url):
# Delegate to the redirect target's own extractor when one is found.
1351 new_url = self._test_redirect(url)
1352 if new_url: return [self.url_result(new_url)]
1354 video_id = url.split('/')[-1]
1356 webpage = self._download_webpage(url, video_id)
1357 except ValueError as err:
1358 # since this is the last-resort InfoExtractor, if
1359 # this error is thrown, it'll be thrown here
1360 self._downloader.report_error(u'Invalid URL: %s' % url)
1363 self.report_extraction(video_id)
# Progressively broader regexes for a direct http media URL in the page.
1364 # Start with something easy: JW Player in SWFObject
1365 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1367 # Broaden the search a little bit
1368 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1370 # Broaden the search a little bit: JWPlayer JS loader
1371 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1373 self._downloader.report_error(u'Invalid URL: %s' % url)
1376 # It's possible that one of the regexes
1377 # matched, but returned an empty group:
1378 if mobj.group(1) is None:
1379 self._downloader.report_error(u'Invalid URL: %s' % url)
1382 video_url = compat_urllib_parse.unquote(mobj.group(1))
1383 video_id = os.path.basename(video_url)
# Derive id and extension from the media URL's basename.
1385 # here's a fun little line of code for you:
1386 video_extension = os.path.splitext(video_id)[1][1:]
1387 video_id = os.path.splitext(video_id)[0]
1389 # it's tempting to parse this further, but you would
1390 # have to take into account all the variations like
1391 # Video Title - Site Name
1392 # Site Name | Video Title
1393 # Video Title - Tagline | Site Name
1394 # and so on and so forth; it's just not practical
1395 mobj = re.search(r'<title>(.*)</title>', webpage)
1397 self._downloader.report_error(u'unable to extract title')
1399 video_title = mobj.group(1)
1401 # video uploader is domain name
1402 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404 self._downloader.report_error(u'unable to extract title')
1406 video_uploader = mobj.group(1)
1411 'uploader': video_uploader,
1412 'upload_date': None,
1413 'title': video_title,
1414 'ext': video_extension,
1419 class YoutubeSearchIE(InfoExtractor):
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs by paging
# through the YouTube GData JSON-C API, 50 results per page.
# NOTE(review): numbering gaps indicate elided lines (guards, the int(n)
# parse inside the try, returns); confirm against the complete source.
1420 """Information Extractor for YouTube search queries."""
1421 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1422 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1423 _max_youtube_results = 1000
1425 IE_NAME = u'youtube:search'
1427 def report_download_page(self, query, pagenum):
1428 """Report attempt to download search page with given number."""
1429 query = query.decode(preferredencoding())
1430 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1432 def _real_extract(self, query):
1433 mobj = re.match(self._VALID_URL, query)
1435 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; prefix selects the count.
1438 prefix, query = query.split(':')
1440 query = query.encode('utf-8')
1442 return self._get_n_results(query, 1)
1443 elif prefix == 'all':
1444 self._get_n_results(query, self._max_youtube_results)
# Numeric prefix path: clamp the requested count to the API maximum.
1449 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1451 elif n > self._max_youtube_results:
1452 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1453 n = self._max_youtube_results
1454 return self._get_n_results(query, n)
1455 except ValueError: # parsing prefix as integer fails
1456 return self._get_n_results(query, 1)
1458 def _get_n_results(self, query, n):
1459 """Get a specified number of results for a query"""
# Page through the API until n results (or the total) are collected.
1465 while (50 * pagenum) < limit:
1466 self.report_download_page(query, pagenum+1)
1467 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1468 request = compat_urllib_request.Request(result_url)
1470 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1472 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1474 api_response = json.loads(data)['data']
1476 if not 'items' in api_response:
1477 self._downloader.report_error(u'[youtube] No video results')
1480 new_ids = list(video['id'] for video in api_response['items'])
1481 video_ids += new_ids
# totalItems caps the loop when fewer than n results exist.
1483 limit = min(n, api_response['totalItems'])
1486 if len(video_ids) > n:
1487 video_ids = video_ids[:n]
1488 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1491 class GoogleSearchIE(InfoExtractor):
# Handles "gvsearchN:query" pseudo-URLs by scraping Google Video search
# result pages and queueing each found video on the downloader directly.
# NOTE(review): numbering gaps indicate elided lines (guards, int(n) parse,
# returns); confirm against the complete source before modifying.
1492 """Information Extractor for Google Video search queries."""
1493 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1494 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1495 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1496 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1497 _max_google_results = 1000
1498 IE_NAME = u'video.google:search'
1500 def report_download_page(self, query, pagenum):
1501 """Report attempt to download playlist page with given number."""
1502 query = query.decode(preferredencoding())
1503 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1505 def _real_extract(self, query):
1506 mobj = re.match(self._VALID_URL, query)
1508 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "gvsearchN" prefix from the query text; prefix selects the count.
1511 prefix, query = query.split(':')
1513 query = query.encode('utf-8')
1515 self._download_n_results(query, 1)
1517 elif prefix == 'all':
1518 self._download_n_results(query, self._max_google_results)
# Numeric prefix path: clamp the requested count to the scraper maximum.
1524 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1526 elif n > self._max_google_results:
1527 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1528 n = self._max_google_results
1529 self._download_n_results(query, n)
1531 except ValueError: # parsing prefix as integer fails
1532 self._download_n_results(query, 1)
1535 def _download_n_results(self, query, n):
1536 """Downloads a specified number of results for a query"""
1542 self.report_download_page(query, pagenum)
1543 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1544 request = compat_urllib_request.Request(result_url)
1546 page = compat_urllib_request.urlopen(request).read()
1547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1548 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1551 # Extract video identifiers
1552 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1553 video_id = mobj.group(1)
1554 if video_id not in video_ids:
1555 video_ids.append(video_id)
# Stop early once n ids are collected and hand them to the downloader.
1556 if len(video_ids) == n:
1557 # Specified n videos reached
1558 for id in video_ids:
1559 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link on the page means the result set is exhausted.
1562 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1563 for id in video_ids:
1564 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1567 pagenum = pagenum + 1
1570 class YahooSearchIE(InfoExtractor):
# Handles "yvsearchN:query" pseudo-URLs by scraping Yahoo! Video search
# result pages; structurally parallel to GoogleSearchIE above.
# NOTE(review): numbering gaps indicate elided lines (guards, int(n) parse,
# returns); confirm against the complete source before modifying.
1571 """Information Extractor for Yahoo! Video search queries."""
1574 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1575 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1576 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1577 _MORE_PAGES_INDICATOR = r'\s*Next'
1578 _max_yahoo_results = 1000
1579 IE_NAME = u'video.yahoo:search'
1581 def report_download_page(self, query, pagenum):
1582 """Report attempt to download playlist page with given number."""
1583 query = query.decode(preferredencoding())
1584 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1586 def _real_extract(self, query):
1587 mobj = re.match(self._VALID_URL, query)
1589 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "yvsearchN" prefix from the query text; prefix selects the count.
1592 prefix, query = query.split(':')
1594 query = query.encode('utf-8')
1596 self._download_n_results(query, 1)
1598 elif prefix == 'all':
1599 self._download_n_results(query, self._max_yahoo_results)
# Numeric prefix path: clamp the requested count to the scraper maximum.
1605 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1607 elif n > self._max_yahoo_results:
1608 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1609 n = self._max_yahoo_results
1610 self._download_n_results(query, n)
1612 except ValueError: # parsing prefix as integer fails
1613 self._download_n_results(query, 1)
1616 def _download_n_results(self, query, n):
1617 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids across result pages.
1620 already_seen = set()
1624 self.report_download_page(query, pagenum)
1625 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1626 request = compat_urllib_request.Request(result_url)
1628 page = compat_urllib_request.urlopen(request).read()
1629 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1630 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1633 # Extract video identifiers
1634 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1635 video_id = mobj.group(1)
1636 if video_id not in already_seen:
1637 video_ids.append(video_id)
1638 already_seen.add(video_id)
# Stop early once n ids are collected and hand them to the downloader.
1639 if len(video_ids) == n:
1640 # Specified n videos reached
1641 for id in video_ids:
1642 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link on the page means the result set is exhausted.
1645 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1646 for id in video_ids:
1647 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1650 pagenum = pagenum + 1
1652 class YoutubePlaylistIE(InfoExtractor):
# Resolves a YouTube playlist URL to its videos via the GData playlists
# feed, paging _MAX_RESULTS entries at a time, and returns a playlist
# result of individual watch-page URLs.
# NOTE(review): numbering gaps indicate elided lines (the verbose-regex
# body, loop header, break/return); confirm against the complete source.
1653 """Information Extractor for YouTube playlists."""
1655 _VALID_URL = r"""(?:
1660 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1661 \? (?:.*?&)*? (?:p|a|list)=
1664 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1667 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1669 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1671 IE_NAME = u'youtube:playlist'
1674 def suitable(cls, url):
1675 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE whitespace.
1676 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1678 def _real_extract(self, url):
1679 # Extract playlist id
1680 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1682 self._downloader.report_error(u'invalid url: %s' % url)
1685 # Download playlist videos from API
# Either capture group may have matched, depending on the URL form.
1686 playlist_id = mobj.group(1) or mobj.group(2)
1691 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1692 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1695 response = json.loads(page)
1696 except ValueError as err:
1697 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1700 if 'feed' not in response:
1701 self._downloader.report_error(u'Got a malformed response from YouTube API')
1703 playlist_title = response['feed']['title']['$t']
1704 if 'entry' not in response['feed']:
1705 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch URL) pairs so ordering can be restored below.
1708 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1709 for entry in response['feed']['entry']
1710 if 'content' in entry ]
# A short page means the feed is exhausted.
1712 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1716 videos = [v[1] for v in sorted(videos)]
1718 url_results = [self.url_result(url, 'Youtube') for url in videos]
1719 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1722 class YoutubeChannelIE(InfoExtractor):
# Lists all videos of a YouTube channel: the first page comes from the
# regular channel page, subsequent pages from the JSON channel_ajax
# endpoint, until the load-more marker disappears.
# NOTE(review): numbering gaps indicate elided lines (guards, loop/break
# structure); confirm against the complete source before modifying.
1723 """Information Extractor for YouTube channels."""
1725 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1726 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1727 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1728 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1729 IE_NAME = u'youtube:channel'
1731 def extract_videos_from_page(self, page):
# Scrape watch-page video ids out of HTML, preserving first-seen order.
1733 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1734 if mobj.group(1) not in ids_in_page:
1735 ids_in_page.append(mobj.group(1))
1738 def _real_extract(self, url):
1739 # Extract channel id
1740 mobj = re.match(self._VALID_URL, url)
1742 self._downloader.report_error(u'invalid url: %s' % url)
1745 # Download channel page
1746 channel_id = mobj.group(1)
1750 url = self._TEMPLATE_URL % (channel_id, pagenum)
1751 page = self._download_webpage(url, channel_id,
1752 u'Downloading page #%s' % pagenum)
1754 # Extract video identifiers
1755 ids_in_page = self.extract_videos_from_page(page)
1756 video_ids.extend(ids_in_page)
1758 # Download any subsequent channel pages using the json-based channel_ajax query
1759 if self._MORE_PAGES_INDICATOR in page:
1761 pagenum = pagenum + 1
1763 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1764 page = self._download_webpage(url, channel_id,
1765 u'Downloading page #%s' % pagenum)
# Ajax responses are JSON; the HTML fragment lives in content_html.
1767 page = json.loads(page)
1769 ids_in_page = self.extract_videos_from_page(page['content_html'])
1770 video_ids.extend(ids_in_page)
# The load-more widget disappears on the last page.
1772 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1775 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1777 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1778 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1779 return [self.playlist_result(url_entries, channel_id)]
1782 class YoutubeUserIE(InfoExtractor):
# Lists all uploads of a YouTube user by paging the GData uploads feed
# _GDATA_PAGE_SIZE ids at a time.
# NOTE(review): numbering gaps indicate elided lines (guards, loop header,
# break); confirm against the complete source before modifying.
1783 """Information Extractor for YouTube users."""
1785 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1786 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1787 _GDATA_PAGE_SIZE = 50
1788 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1789 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1790 IE_NAME = u'youtube:user'
1792 def _real_extract(self, url):
1794 mobj = re.match(self._VALID_URL, url)
1796 self._downloader.report_error(u'invalid url: %s' % url)
1799 username = mobj.group(1)
1801 # Download video ids using YouTube Data API. Result size per
1802 # query is limited (currently to 50 videos) so we need to query
1803 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1810 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1812 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1813 page = self._download_webpage(gdata_url, username,
1814 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1816 # Extract video identifiers
# Scrape ids out of the feed XML, deduplicating within the page.
1819 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1820 if mobj.group(1) not in ids_in_page:
1821 ids_in_page.append(mobj.group(1))
1823 video_ids.extend(ids_in_page)
1825 # A little optimization - if current page is not
1826 # "full", ie. does not contain PAGE_SIZE video ids then
1827 # we can assume that this page is the last one - there
1828 # are no more ids on further pages - no need to query
1831 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1836 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1837 url_results = [self.url_result(url, 'Youtube') for url in urls]
1838 return [self.playlist_result(url_results, playlist_title = username)]
1841 class BlipTVUserIE(InfoExtractor):
# Lists all episodes of a blip.tv user via the mobile-site Ajax endpoint,
# paging until a short page signals the end.
# NOTE(review): numbering gaps indicate elided lines (guards, loop header,
# break, _PAGE_SIZE definition); confirm against the complete source.
1842 """Information Extractor for blip.tv users."""
1844 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1846 IE_NAME = u'blip.tv:user'
1848 def _real_extract(self, url):
1850 mobj = re.match(self._VALID_URL, url)
1852 self._downloader.report_error(u'invalid url: %s' % url)
1855 username = mobj.group(1)
1857 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# The numeric users_id for the Ajax endpoint is scraped from the user page.
1859 page = self._download_webpage(url, username, u'Downloading user page')
1860 mobj = re.search(r'data-users-id="([^"]+)"', page)
1861 page_base = page_base % mobj.group(1)
1864 # Download video ids using BlipTV Ajax calls. Result size per
1865 # query is limited (currently to 12 videos) so we need to query
1866 # page by page until there are no video ids - it means we got
1873 url = page_base + "&page=" + str(pagenum)
1874 page = self._download_webpage(url, username,
1875 u'Downloading video ids from page %d' % pagenum)
1877 # Extract video identifiers
# hrefs in the episode list are relative video paths; unescape entities.
1880 for mobj in re.finditer(r'href="/([^"]+)"', page):
1881 if mobj.group(1) not in ids_in_page:
1882 ids_in_page.append(unescapeHTML(mobj.group(1)))
1884 video_ids.extend(ids_in_page)
1886 # A little optimization - if current page is not
1887 # "full", ie. does not contain PAGE_SIZE video ids then
1888 # we can assume that this page is the last one - there
1889 # are no more ids on further pages - no need to query
1892 if len(ids_in_page) < self._PAGE_SIZE:
1897 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1898 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1899 return [self.playlist_result(url_entries, playlist_title = username)]
1902 class DepositFilesIE(InfoExtractor):
# Extractor for depositfiles.com file pages: POSTs the "Free download"
# form and scrapes the real fileshare URL (or the restriction message).
# NOTE(review): numbering gaps indicate elided lines (try frames, guards,
# the final return); confirm against the complete source before modifying.
1903 """Information extractor for depositfiles.com"""
1905 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1907 def _real_extract(self, url):
1908 file_id = url.split('/')[-1]
1909 # Rebuild url in english locale
1910 url = 'http://depositfiles.com/en/files/' + file_id
1912 # Retrieve file webpage with 'Free download' button pressed
# gateway_result=1 is the form field the site's free-download button posts.
1913 free_download_indication = { 'gateway_result' : '1' }
1914 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1916 self.report_download_webpage(file_id)
1917 webpage = compat_urllib_request.urlopen(request).read()
1918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1922 # Search for the real file URL
1923 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1924 if (mobj is None) or (mobj.group(1) is None):
1925 # Try to figure out reason of the error.
# The site explains restrictions in an <strong>Attention...</strong> blurb.
1926 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1927 if (mobj is not None) and (mobj.group(1) is not None):
1928 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1929 self._downloader.report_error(u'%s' % restriction_message)
1931 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1934 file_url = mobj.group(1)
1935 file_extension = os.path.splitext(file_url)[1][1:]
1937 # Search for file title
1938 mobj = re.search(r'<b title="(.*?)">', webpage)
1940 self._downloader.report_error(u'unable to extract title')
1942 file_title = mobj.group(1).decode('utf-8')
1945 'id': file_id.decode('utf-8'),
1946 'url': file_url.decode('utf-8'),
1948 'upload_date': None,
1949 'title': file_title,
1950 'ext': file_extension.decode('utf-8'),
1954 class FacebookIE(InfoExtractor):
# Extractor for Facebook videos. Optionally logs in (credentials from
# options or .netrc) in _real_initialize, then parses the swf parameter
# JSON out of the video page for the hd/sd media URLs.
# NOTE(review): numbering gaps indicate elided lines (guards, returns,
# login_form construction); confirm against the complete source.
1955 """Information Extractor for Facebook"""
1957 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1958 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1959 _NETRC_MACHINE = 'facebook'
1960 IE_NAME = u'facebook'
1962 def report_login(self):
1963 """Report attempt to log in."""
1964 self.to_screen(u'Logging in')
1966 def _real_initialize(self):
1967 if self._downloader is None:
1972 downloader_params = self._downloader.params
1974 # Attempt to use provided username and password or .netrc data
1975 if downloader_params.get('username', None) is not None:
1976 useremail = downloader_params['username']
1977 password = downloader_params['password']
1978 elif downloader_params.get('usenetrc', False):
1980 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1981 if info is not None:
1985 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1986 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems only warn; extraction proceeds without login.
1987 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1990 if useremail is None:
1999 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2002 login_results = compat_urllib_request.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2003 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2004 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2006 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2007 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2010 def _real_extract(self, url):
2011 mobj = re.match(self._VALID_URL, url)
2013 self._downloader.report_error(u'invalid URL: %s' % url)
2015 video_id = mobj.group('ID')
2017 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2018 webpage = self._download_webpage(url, video_id)
# The swf setup code brackets a JSON array of [param, value] pairs;
# cut it out by its literal surroundings and parse it.
2020 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2021 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2022 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2024 raise ExtractorError(u'Cannot parse data')
2025 data = dict(json.loads(m.group(1)))
2026 params_raw = compat_urllib_parse.unquote(data['params'])
2027 params = json.loads(params_raw)
2028 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD, else fail.
2029 video_url = video_data.get('hd_src')
2031 video_url = video_data['sd_src']
2033 raise ExtractorError(u'Cannot find video URL')
2034 video_duration = int(video_data['video_duration'])
2035 thumbnail = video_data['thumbnail_src']
2037 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2039 raise ExtractorError(u'Cannot find title in webpage')
2040 video_title = unescapeHTML(m.group(1))
2044 'title': video_title,
2047 'duration': video_duration,
2048 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Fetches the per-video JSON description (served when the iTunes
    User-Agent is sent) and reads the media URL, metadata and upload
    date from it; direct video responses are downloaded as-is.

    NOTE(review): this source view is elided; several guard/try/return
    lines between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a player whose URL fragment carries the
        # real file id; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves JSON (instead of HTML) to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # 'datestamp' looks like "04-26-12 06:48PM"; convert to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Keep the iTunes UA for the media download as well.
            'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the direct .flv URL from the thumbnail <link rel='image_src'>
    tag: the thumbnail URL's movie-path prefix plus "/<id>.flv" is the
    media URL.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.report_error` (no such
            # attribute, AttributeError on every invalid URL); the
            # downloader is stored as `self._downloader`.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                         webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        # Thumbnail path prefix + "/<id>.flv" is the direct media URL.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves show/episode/clip URLs to an MTV media URI, downloads the
    MRSS index for that URI, and yields one info dict per episode part.

    NOTE(review): this source view is elided; several guard/try/return
    lines between the visible statements are not shown here.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                     (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                     |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates known to be offered, lowest quality last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden from the base class because _VALID_URL is written
        # with re.VERBOSE whitespace/comments.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print the available format codes with extension and dimensions.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand ":tds"-style abbreviations to the show's episode index.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage = self._download_webpage(url, epTitle)
        # The site may redirect; re-match against the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # MRSS feed listing every part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # <guid> is like "...:<show>.com:<mediaId>"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for each rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the rtmp URL to the equivalent plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads page metadata from <meta> tags and the player configuration
    (JavaScript masquerading as JSON) referenced by the og:video URL.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Description, thumbnail and player URL all come from <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config file URL is percent-encoded in the player URL's
        # "config" query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The second playlist entry holds the actual video URL.
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Downloads the moogaloop metadata XML for the video id, then the
    Adobe F4M manifest it references, and builds the final segment URL
    from the manifest's media node.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        # Per-video metadata XML (title, description, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.report_error(u'Invalid metadata XML file')

        # hdcore parameter is required by the F4M (HDS) server.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        # The manifest uses the Adobe F4M namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Compose the segment URL from the manifest host and media node.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv_url page variable, the <title> tag and the
    thumbnail URL directly from the watch page.

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Video URL is percent-encoded in the "flv_url" page variable.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title: <title> contents up to the " - XVID..." suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to a numeric track id via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions and use the 128k MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        'uploader': info['user']['username'],
        'upload_date': upload_date,
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink to its track list via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        # API signals problems through an 'errors' list.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # One info dict per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media id is base64-encoded in the page's "jsclassref"
    JavaScript variable; decoding it yields the rtmpe path.

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media reference is base64-encoded in "jsclassref".
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title lives in the "contentTitle" JS variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Uses the old /api/1/cloudcast/ JSON endpoint to list audio formats
    and picks the first reachable URL for the requested format.

    NOTE(review): marked broken (_WORKING = False); this source view is
    also elided — guard/try/return lines are not all shown here.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probe each candidate URL; network errors mean "try the next".
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # List every format/bitrate pair with its file extension.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
            except TypeError: # we have no bitrate info
                ext = formats[fmt][0]
                print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try each format until one has a reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course+video), a course
    page (list of video references), and the root page (list of course
    references). List results are expanded recursively via self.extract.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            # Per-video metadata XML lives next to the course's videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.report_error(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                                note='Downloading course info page',
                                                errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fall back to the id when no <h1> title is present.
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry to expand.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Each CoursePage link becomes a reference entry to expand.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes song/performer/uri metadata from <meta> tags, then queries
    the mediaGen service for an XML list of renditions.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; default to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.report_error(u'unable to extract song name')
        # Page metadata is declared as ISO-8859-1.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.report_error(u'unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.report_error(u'unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.report_error(u'unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen returns the rendition list for this uri/content pair.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format string: "<ext>-<width>x<height>_<bitrate>".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.report_error('Invalid rendition field.')

        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its file ids: a seed from the playlist JSON drives
    a deterministic shuffle of a character table, which decodes the
    '*'-separated id into the real fileid; segments are then fetched
    one by one with per-segment keys.

    NOTE(review): this source view is elided; some method headers and
    guard/return lines between the visible statements are not shown.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # Session id: millisecond timestamp + two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character table using `seed`
        # as a linear-congruential PRNG state.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated fileId through the shuffled table.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Playlist JSON carries title, seed, formats and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
# Scrapes the watch page directly: the flv URL, title and thumbnail are each
# pulled out with a dedicated regex (the class-level *_RE constants).
3102 class XNXXIE(InfoExtractor):
3103 """Information extractor for xnxx.com"""
3105 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3107 VIDEO_URL_RE = r'flv_url=(.*?)&'
3108 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3109 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3111 def _real_extract(self, url):
3112 mobj = re.match(self._VALID_URL, url)
3114 self._downloader.report_error(u'invalid URL: %s' % url)
3116 video_id = mobj.group(1)
3118 # Get webpage content
3119 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page, hence the unquote below.
3121 result = re.search(self.VIDEO_URL_RE, webpage)
3123 self._downloader.report_error(u'unable to extract video url')
3125 video_url = compat_urllib_parse.unquote(result.group(1))
3127 result = re.search(self.VIDEO_TITLE_RE, webpage)
3129 self._downloader.report_error(u'unable to extract video title')
3131 video_title = result.group(1)
3133 result = re.search(self.VIDEO_THUMB_RE, webpage)
3135 self._downloader.report_error(u'unable to extract video thumbnail')
3137 video_thumbnail = result.group(1)
# NOTE(review): the info-dict opener and several keys are elided in this copy.
3143 'upload_date': None,
3144 'title': video_title,
3146 'thumbnail': video_thumbnail,
3147 'description': None,
# Two-step extraction: (1) scrape the post page for date/uploader/title and
# the photo-viewer URL, (2) load that viewer page and pick the
# highest-resolution googlevideo link out of it.
3151 class GooglePlusIE(InfoExtractor):
3152 """Information extractor for plus.google.com."""
3154 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3155 IE_NAME = u'plus.google'
3157 def report_extract_entry(self, url):
3158 """Report downloading extry"""
3159 self.to_screen(u'Downloading entry: %s' % url)
3161 def report_date(self, upload_date):
3162 """Report downloading extry"""
3163 self.to_screen(u'Entry date: %s' % upload_date)
3165 def report_uploader(self, uploader):
3166 """Report downloading extry"""
3167 self.to_screen(u'Uploader: %s' % uploader)
3169 def report_title(self, video_title):
3170 """Report downloading extry"""
3171 self.to_screen(u'Title: %s' % video_title)
3173 def report_extract_vid_page(self, video_page):
3174 """Report information extraction."""
3175 self.to_screen(u'Extracting video page: %s' % video_page)
3177 def _real_extract(self, url):
3178 # Extract id from URL
3179 mobj = re.match(self._VALID_URL, url)
3181 self._downloader.report_error(u'Invalid URL: %s' % url)
3184 post_url = mobj.group(0)
3185 video_id = mobj.group(1)
3187 video_extension = 'flv'
3189 # Step 1, Retrieve post webpage to extract further information
3190 self.report_extract_entry(post_url)
3191 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3193 # Extract update date
3195 pattern = 'title="Timestamp">(.*?)</a>'
3196 mobj = re.search(pattern, webpage)
3198 upload_date = mobj.group(1)
3199 # Convert timestring to a format suitable for filename
# Site shows ISO-style dates; normalise to the YYYYMMDD form the downloader uses.
3200 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3201 upload_date = upload_date.strftime('%Y%m%d')
3202 self.report_date(upload_date)
3206 pattern = r'rel\="author".*?>(.*?)</a>'
3207 mobj = re.search(pattern, webpage)
3209 uploader = mobj.group(1)
3210 self.report_uploader(uploader)
3213 # Get the first line for title
3215 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3216 mobj = re.search(pattern, webpage)
3218 video_title = mobj.group(1)
3219 self.report_title(video_title)
3221 # Step 2, Stimulate clicking the image box to launch video
3222 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3223 mobj = re.search(pattern, webpage)
3225 self._downloader.report_error(u'unable to extract video page URL')
3227 video_page = mobj.group(1)
3228 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3229 self.report_extract_vid_page(video_page)
3232 # Extract video links on video page
3233 """Extract video links of all sizes"""
3234 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3235 mobj = re.findall(pattern, webpage)
3237 self._downloader.report_error(u'unable to extract video links')
3239 # Sort in resolution
# Tuples are (resolution, url); plain sort puts the largest resolution last.
3240 links = sorted(mobj)
3242 # Choose the lowest of the sort, i.e. highest resolution
3243 video_url = links[-1]
3244 # Only get the url. The resolution part in the tuple has no use anymore
3245 video_url = video_url[-1]
3246 # Treat escaped \u0026 style hex
# Py2 strings have .decode; Py3 str does not, hence the AttributeError fallback.
3248 video_url = video_url.decode("unicode_escape")
3249 except AttributeError: # Python 3
3250 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3256 'uploader': uploader,
3257 'upload_date': upload_date,
3258 'title': video_title,
3259 'ext': video_extension,
# nba.com extractor: the mp4 URL is constructed directly from the path-derived
# video id against Turner's CDN; page metadata is scraped via _findProp.
3262 class NBAIE(InfoExtractor):
3263 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3266 def _real_extract(self, url):
3267 mobj = re.match(self._VALID_URL, url)
3269 self._downloader.report_error(u'invalid URL: %s' % url)
3272 video_id = mobj.group(1)
3273 if video_id.endswith('/index.html'):
3274 video_id = video_id[:-len('/index.html')]
3276 webpage = self._download_webpage(url, video_id)
3278 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from the page, HTML-unescaped, or `default`.
3279 def _findProp(rexp, default=None):
3280 m = re.search(rexp, webpage)
3282 return unescapeHTML(m.group(1))
3286 shortened_video_id = video_id.rpartition('/')[2]
3287 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm
# against the field names the downloader expects before relying on it.
3289 'id': shortened_video_id,
3293 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3294 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3298 class JustinTVIE(InfoExtractor):
3299 """Information extractor for justin.tv and twitch.tv"""
3300 # TODO: One broadcast may be split into multiple videos. The key
3301 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3302 # starts at 1 and increases. Can we treat all parts as one video?
3304 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3305 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3306 _JUSTIN_PAGE_LIMIT = 100
3307 IE_NAME = u'justin.tv'
3309 def report_download_page(self, channel, offset):
3310 """Report attempt to download a single page of videos."""
3311 self.to_screen(u'%s: Downloading video information from %d to %d' %
3312 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3314 # Return count of items, list of *valid* items
3315 def _parse_page(self, url, video_id):
3316 webpage = self._download_webpage(url, video_id,
3317 u'Downloading video info JSON',
3318 u'unable to download video info JSON')
# The API returns a JSON list of clips on success, or an error object.
3320 response = json.loads(webpage)
3321 if type(response) != list:
3322 error_text = response.get('error', 'unknown error')
3323 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3326 for clip in response:
3327 video_url = clip['video_file_url']
3329 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip the dashes from the date part → YYYYMMDD.
3330 video_date = re.sub('-', '', clip['start_time'][:10])
3331 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3332 video_id = clip['id']
3333 video_title = clip.get('title', video_id)
3337 'title': video_title,
3338 'uploader': clip.get('channel_name', video_uploader_id),
3339 'uploader_id': video_uploader_id,
3340 'upload_date': video_date,
3341 'ext': video_extension,
3343 return (len(response), info)
3345 def _real_extract(self, url):
3346 mobj = re.match(self._VALID_URL, url)
3348 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel name, group 2 = single broadcast id; lastindex tells us
# which form of URL we matched and therefore which API endpoint to hit.
3351 api = 'http://api.justin.tv'
3352 video_id = mobj.group(mobj.lastindex)
3354 if mobj.lastindex == 1:
3356 api += '/channel/archives/%s.json'
3358 api += '/broadcast/by_archive/%s.json'
3359 api = api % (video_id,)
3361 self.report_extraction(video_id)
# Page through the API in _JUSTIN_PAGE_LIMIT-sized chunks; a short page
# (count != limit) means we reached the end.
3365 limit = self._JUSTIN_PAGE_LIMIT
3368 self.report_download_page(video_id, offset)
3369 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3370 page_count, page_info = self._parse_page(page_url, video_id)
3371 info.extend(page_info)
3372 if not paged or page_count != limit:
# funnyordie.com extractor: video URL from the <video>/<source> markup,
# title from the player h1 (falling back to <title>), description from og: meta.
3377 class FunnyOrDieIE(InfoExtractor):
3378 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3380 def _real_extract(self, url):
3381 mobj = re.match(self._VALID_URL, url)
3383 self._downloader.report_error(u'invalid URL: %s' % url)
3386 video_id = mobj.group('id')
3387 webpage = self._download_webpage(url, video_id)
3389 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3391 self._downloader.report_error(u'unable to find video information')
3392 video_url = unescapeHTML(m.group('url'))
3394 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3396 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3398 self._downloader.report_error(u'Cannot find video title')
3399 title = clean_html(m.group('title'))
3401 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3403 desc = unescapeHTML(m.group('desc'))
3412 'description': desc,
# Steam store extractor: visits the age-gate URL with a fixed fake birthday,
# then zips three regex scans (movie entries, titles, thumbnails) into a
# playlist of videos for the game page.
3416 class SteamIE(InfoExtractor):
3417 _VALID_URL = r"""http://store\.steampowered\.com/
3419 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3421 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3425 def suitable(cls, url):
3426 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style whitespace and
# must be matched with re.VERBOSE.
3427 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3429 def _real_extract(self, url):
3430 m = re.match(self._VALID_URL, url, re.VERBOSE)
3431 gameID = m.group('gameID')
3432 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3433 self.report_age_confirmation()
3434 webpage = self._download_webpage(videourl, gameID)
3435 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3437 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3438 mweb = re.finditer(urlRE, webpage)
3439 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3440 titles = re.finditer(namesRE, webpage)
3441 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3442 thumbs = re.finditer(thumbsRE, webpage)
# Relies on the three scans yielding entries in the same page order.
3444 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3445 video_id = vid.group('videoID')
3446 title = vtitle.group('videoName')
3447 video_url = vid.group('videoURL')
3448 video_thumb = thumb.group('thumbnail')
3450 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3455 'title': unescapeHTML(title),
3456 'thumbnail': video_thumb
3459 return [self.playlist_result(videos, gameID, game_title)]
# ustream.tv recorded-video extractor: the flv URL is derived directly from
# the video id; title and uploader are scraped from data-* attributes.
3461 class UstreamIE(InfoExtractor):
3462 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3463 IE_NAME = u'ustream'
3465 def _real_extract(self, url):
3466 m = re.match(self._VALID_URL, url)
3467 video_id = m.group('videoID')
3468 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3469 webpage = self._download_webpage(url, video_id)
# NOTE(review): no None-checks on these searches — a markup change would raise
# AttributeError on .group(); verify intended.
3470 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3471 title = m.group('title')
3472 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3473 uploader = m.group('uploader')
3479 'uploader': uploader
# worldstarhiphop.com extractor: finds an mp4/flv CDN URL in the page source,
# takes the page <title>, and falls back to a timestamped title when absent.
3483 class WorldStarHipHopIE(InfoExtractor):
3484 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3485 IE_NAME = u'WorldStarHipHop'
3487 def _real_extract(self, url):
# Matches a direct media URL served from the hw-videos / hw-post1 hosts.
3488 _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""
3490 m = re.match(self._VALID_URL, url)
3491 video_id = m.group('id')
3493 webpage_src = self._download_webpage(url, video_id)
3495 mobj = re.search(_src_url, webpage_src)
3497 if mobj is not None:
3498 video_url = mobj.group()
3499 if 'mp4' in video_url:
3504 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3507 _title = r"""<title>(.*)</title>"""
3509 mobj = re.search(_title, webpage_src)
3511 if mobj is not None:
3512 title = mobj.group(1)
3514 title = 'World Start Hip Hop - %s' % time.ctime()
3516 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3517 mobj = re.search(_thumbnail, webpage_src)
3519 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3520 if mobj is not None:
3521 thumbnail = mobj.group(1)
3523 _title = r"""candytitles.*>(.*)</span>"""
3524 mobj = re.search(_title, webpage_src)
3525 if mobj is not None:
3526 title = mobj.group(1)
3533 'thumbnail' : thumbnail,
# rbmaradio.com extractor: show metadata is embedded as a JSON blob assigned
# to gon.show in an inline <script>; everything is read from that dict.
3538 class RBMARadioIE(InfoExtractor):
3539 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3541 def _real_extract(self, url):
3542 m = re.match(self._VALID_URL, url)
3543 video_id = m.group('videoID')
3545 webpage = self._download_webpage(url, video_id)
3546 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3548 raise ExtractorError(u'Cannot find metadata')
3549 json_data = m.group(1)
3552 data = json.loads(json_data)
3553 except ValueError as e:
3554 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps variant via the cbr query parameter; extension comes
# from the URL path.
3556 video_url = data['akamai_url'] + '&cbr=256'
3557 url_parts = compat_urllib_parse_urlparse(video_url)
3558 video_ext = url_parts.path.rpartition('.')[2]
3563 'title': data['title'],
3564 'description': data.get('teaser_text'),
3565 'location': data.get('country_of_origin'),
3566 'uploader': data.get('host', {}).get('name'),
3567 'uploader_id': data.get('host', {}).get('slug'),
3568 'thumbnail': data.get('image', {}).get('large_url_2x'),
3569 'duration': data.get('duration'),
3574 class YouPornIE(InfoExtractor):
3575 """Information extractor for youporn.com."""
3576 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3578 def _print_formats(self, formats):
3579 """Print all available formats"""
3580 print(u'Available formats:')
3581 print(u'ext\t\tformat')
3582 print(u'---------------------------------')
3583 for format in formats:
3584 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the format dict matching req_format (loop header elided in this copy).
3586 def _specific(self, req_format, formats):
3588 if(x["format"]==req_format):
3592 def _real_extract(self, url):
3593 mobj = re.match(self._VALID_URL, url)
3595 self._downloader.report_error(u'invalid URL: %s' % url)
3598 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the site's age gate.
3600 req = compat_urllib_request.Request(url)
3601 req.add_header('Cookie', 'age_verified=1')
3602 webpage = self._download_webpage(req, video_id)
3604 # Get the video title
3605 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3607 raise ExtractorError(u'Unable to extract video title')
3608 video_title = result.group('title').strip()
3610 # Get the video date
3611 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3613 self._downloader.report_warning(u'unable to extract video date')
3616 upload_date = unified_strdate(result.group('date').strip())
3618 # Get the video uploader
3619 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3621 self._downloader.report_warning(u'unable to extract uploader')
3622 video_uploader = None
3624 video_uploader = result.group('uploader').strip()
3625 video_uploader = clean_html( video_uploader )
3627 # Get all of the formats available
3628 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3629 result = re.search(DOWNLOAD_LIST_RE, webpage)
3631 raise ExtractorError(u'Unable to extract download list')
3632 download_list_html = result.group('download_list').strip()
3634 # Get all of the links from the page
3635 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3636 links = re.findall(LINK_RE, download_list_html)
3637 if(len(links) == 0):
3638 raise ExtractorError(u'ERROR: no known formats available for video')
3640 self.to_screen(u'Links found: %d' % len(links))
3645 # A link looks like this:
3646 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3647 # A path looks like this:
3648 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Resolution ("480p") and bitrate ("370k") are encoded in the 5th path segment.
3649 video_url = unescapeHTML( link )
3650 path = compat_urllib_parse_urlparse( video_url ).path
3651 extension = os.path.splitext( path )[1][1:]
3652 format = path.split('/')[4].split('_')[:2]
3655 format = "-".join( format )
3656 title = u'%s-%s-%s' % (video_title, size, bitrate)
3661 'uploader': video_uploader,
3662 'upload_date': upload_date,
3667 'description': None,
# Format selection: list-only mode, then best/worst/all/specific handling.
3671 if self._downloader.params.get('listformats', None):
3672 self._print_formats(formats)
3675 req_format = self._downloader.params.get('format', None)
3676 self.to_screen(u'Format: %s' % req_format)
3678 if req_format is None or req_format == 'best':
3680 elif req_format == 'worst':
# Assumes `formats` is ordered best-first, so the worst is the last entry.
3681 return [formats[-1]]
3682 elif req_format in ('-1', 'all'):
3685 format = self._specific( req_format, formats )
3687 self._downloader.report_error(u'requested format not available')
3693 class PornotubeIE(InfoExtractor):
3694 """Information extractor for pornotube.com."""
3695 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3697 def _real_extract(self, url):
3698 mobj = re.match(self._VALID_URL, url)
3700 self._downloader.report_error(u'invalid URL: %s' % url)
# Title comes from the URL itself (the <title> named group), not the page.
3703 video_id = mobj.group('videoid')
3704 video_title = mobj.group('title')
3706 # Get webpage content
3707 webpage = self._download_webpage(url, video_id)
3710 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3711 result = re.search(VIDEO_URL_RE, webpage)
3713 self._downloader.report_error(u'unable to extract video url')
3715 video_url = compat_urllib_parse.unquote(result.group('url'))
3717 #Get the uploaded date
3718 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3719 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "title" but the failed extraction is
# the upload date — likely a copy/paste slip; confirm before changing.
3721 self._downloader.report_error(u'unable to extract video title')
3723 upload_date = unified_strdate(result.group('date'))
3725 info = {'id': video_id,
3728 'upload_date': upload_date,
3729 'title': video_title,
3735 class YouJizzIE(InfoExtractor):
3736 """Information extractor for youjizz.com."""
3737 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3739 def _real_extract(self, url):
3740 mobj = re.match(self._VALID_URL, url)
3742 self._downloader.report_error(u'invalid URL: %s' % url)
3745 video_id = mobj.group('videoid')
3747 # Get webpage content
3748 webpage = self._download_webpage(url, video_id)
3750 # Get the video title
3751 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3753 raise ExtractorError(u'ERROR: unable to extract video title')
3754 video_title = result.group('title').strip()
3756 # Get the embed page
# The watch page only references an embed page; the real media URL lives there.
3757 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3759 raise ExtractorError(u'ERROR: unable to extract embed page')
3761 embed_page_url = result.group(0).strip()
3762 video_id = result.group('videoid')
3764 webpage = self._download_webpage(embed_page_url, video_id)
# The flv URL is passed to the flash player via so.addVariable("file", ...).
3767 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3769 raise ExtractorError(u'ERROR: unable to extract video url')
3770 video_url = result.group('source')
3772 info = {'id': video_id,
3774 'title': video_title,
3777 'player_url': embed_page_url}
# 8tracks.com extractor: reads the PAGE.mix JSON embedded in the playlist
# page, then walks the play/next API with a random session id until the API
# reports the last track.
3781 class EightTracksIE(InfoExtractor):
3783 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3785 def _real_extract(self, url):
3786 mobj = re.match(self._VALID_URL, url)
3788 raise ExtractorError(u'Invalid URL: %s' % url)
3789 playlist_id = mobj.group('id')
3791 webpage = self._download_webpage(url, playlist_id)
3793 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3795 raise ExtractorError(u'Cannot find trax information')
3796 json_like = m.group(1)
3797 data = json.loads(json_like)
# Session id is just a random number; the API tracks play position by it.
3799 session = str(random.randint(0, 1000000000))
3801 track_count = data['tracks_count']
3802 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3803 next_url = first_url
3805 for i in itertools.count():
3806 api_json = self._download_webpage(next_url, playlist_id,
3807 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3808 errnote=u'Failed to download song information')
3809 api_data = json.loads(api_json)
3810 track_data = api_data[u'set']['track']
3812 'id': track_data['id'],
3813 'url': track_data['track_file_stream_url'],
3814 'title': track_data['performer'] + u' - ' + track_data['name'],
3815 'raw_title': track_data['name'],
3816 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise request the next one.
3820 if api_data['set']['at_last_track']:
3822 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# keek.com extractor: media and thumbnail URLs are derived from the video id
# against keek's CDN; title/uploader are scraped from the page.
3825 class KeekIE(InfoExtractor):
3826 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3829 def _real_extract(self, url):
3830 m = re.match(self._VALID_URL, url)
3831 video_id = m.group('videoID')
3832 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3833 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3834 webpage = self._download_webpage(url, video_id)
3835 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3836 title = unescapeHTML(m.group('title'))
3837 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3838 uploader = clean_html(m.group('uploader'))
3844 'thumbnail': thumbnail,
3845 'uploader': uploader
# ted.com extractor: handles both single talks and playlists; talk download
# URLs are built from the "mediaSlug" found in the talkDetails script blob.
3849 class TEDIE(InfoExtractor):
3850 _VALID_URL=r'''http://www\.ted\.com/
3852 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3854 ((?P<type_talk>talks)) # We have a simple talk
3856 (/lang/(.*?))? # The url may contain the language
3857 /(?P<name>\w+) # Here goes the name and then ".html"
3861 def suitable(cls, url):
3862 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL uses verbose-mode whitespace/comments.
3863 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3865 def _real_extract(self, url):
3866 m=re.match(self._VALID_URL, url, re.VERBOSE)
3867 if m.group('type_talk'):
3868 return [self._talk_info(url)]
3870 playlist_id=m.group('playlist_id')
3871 name=m.group('name')
3872 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3873 return [self._playlist_videos_info(url,name,playlist_id)]
3875 def _talk_video_link(self,mediaSlug):
3876 '''Returns the video link for that mediaSlug'''
3877 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3879 def _playlist_videos_info(self,url,name,playlist_id=0):
3880 '''Returns the videos of the playlist'''
3882 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3883 ([.\s]*?)data-playlist_item_id="(\d+)"
3884 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3886 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3887 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3888 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3889 m_names=re.finditer(video_name_RE,webpage)
3891 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3892 m_playlist = re.search(playlist_RE, webpage)
3893 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back through url_result to this same IE.
3895 playlist_entries = []
3896 for m_video, m_name in zip(m_videos,m_names):
3897 video_id=m_video.group('video_id')
3898 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3899 playlist_entries.append(self.url_result(talk_url, 'TED'))
3900 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3902 def _talk_info(self, url, video_id=0):
3903 """Return the video for the talk in the url"""
3904 m=re.match(self._VALID_URL, url,re.VERBOSE)
3905 videoName=m.group('name')
3906 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3907 # If the url includes the language we get the title translated
3908 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3909 title=re.search(title_RE, webpage).group('title')
3910 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3911 "id":(?P<videoID>[\d]+).*?
3912 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3913 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3914 thumb_match=re.search(thumb_RE,webpage)
3915 info_match=re.search(info_RE,webpage,re.VERBOSE)
3916 video_id=info_match.group('videoID')
3917 mediaSlug=info_match.group('mediaSlug')
3918 video_url=self._talk_video_link(mediaSlug)
3924 'thumbnail': thumb_match.group('thumbnail')
# myspass.de extractor: fetches an XML metadata document keyed by the video
# id (last URL path element) and reads url/title/format/description from it.
3928 class MySpassIE(InfoExtractor):
3929 _VALID_URL = r'http://www.myspass.de/.*'
3931 def _real_extract(self, url):
3932 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3934 # video id is the last path element of the URL
3935 # usually there is a trailing slash, so also try the second but last
3936 url_path = compat_urllib_parse_urlparse(url).path
3937 url_parent_path, video_id = os.path.split(url_path)
3939 _, video_id = os.path.split(url_parent_path)
3942 metadata_url = META_DATA_URL_TEMPLATE % video_id
3943 metadata_text = self._download_webpage(metadata_url, video_id)
3944 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3946 # extract values from metadata
3947 url_flv_el = metadata.find('url_flv')
3948 if url_flv_el is None:
3949 self._downloader.report_error(u'unable to extract download url')
3951 video_url = url_flv_el.text
3952 extension = os.path.splitext(video_url)[1][1:]
3953 title_el = metadata.find('title')
3954 if title_el is None:
3955 self._downloader.report_error(u'unable to extract title')
3957 title = title_el.text
# format/description/thumbnail are optional; missing elements fall through
# (defaults assigned on lines elided from this copy).
3958 format_id_el = metadata.find('format_id')
3959 if format_id_el is None:
3962 format = format_id_el.text
3963 description_el = metadata.find('description')
3964 if description_el is not None:
3965 description = description_el.text
3968 imagePreview_el = metadata.find('imagePreview')
3969 if imagePreview_el is not None:
3970 thumbnail = imagePreview_el.text
3979 'thumbnail': thumbnail,
3980 'description': description
# spiegel.de extractor: title from the page, then a per-video XML manifest
# whose last entry supplies filename and duration.
3984 class SpiegelIE(InfoExtractor):
3985 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3987 def _real_extract(self, url):
3988 m = re.match(self._VALID_URL, url)
3989 video_id = m.group('videoID')
3991 webpage = self._download_webpage(url, video_id)
3992 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3994 raise ExtractorError(u'Cannot find title')
3995 video_title = unescapeHTML(m.group(1))
3997 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3998 xml_code = self._download_webpage(xml_url, video_id,
3999 note=u'Downloading XML', errnote=u'Failed to download XML')
# The manifest lists variants; the last child element is the one used here.
4001 idoc = xml.etree.ElementTree.fromstring(xml_code)
4002 last_type = idoc[-1]
4003 filename = last_type.findall('./filename')[0].text
4004 duration = float(last_type.findall('./duration')[0].text)
4006 video_url = 'http://video2.spiegel.de/flash/' + filename
4007 video_ext = filename.rpartition('.')[2]
4012 'title': video_title,
4013 'duration': duration,
# liveleak.com extractor: media URL from the player config ('file: "..."'),
# title/description from og: meta tags, uploader from the "By:" byline.
4017 class LiveLeakIE(InfoExtractor):
4019 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4020 IE_NAME = u'liveleak'
4022 def _real_extract(self, url):
4023 mobj = re.match(self._VALID_URL, url)
4025 self._downloader.report_error(u'invalid URL: %s' % url)
4028 video_id = mobj.group('video_id')
4030 webpage = self._download_webpage(url, video_id)
4032 m = re.search(r'file: "(.*?)",', webpage)
4034 self._downloader.report_error(u'unable to find video url')
4036 video_url = m.group(1)
4038 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4040 self._downloader.report_error(u'Cannot find video title')
# The site prefixes titles with "LiveLeak.com -"; strip it for a clean title.
4041 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4043 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4045 desc = unescapeHTML(m.group('desc'))
4049 m = re.search(r'By:.*?(\w+)</a>', webpage)
4051 uploader = clean_html(m.group(1))
4060 'description': desc,
4061 'uploader': uploader
# ARD Mediathek extractor: collects every mediaCollection.addMediaStream(...)
# call from the page, picks the default media type at the highest quality,
# and distinguishes RTMP streams from plain HTTP mp4 downloads.
4066 class ARDIE(InfoExtractor):
4067 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4068 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4069 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4071 def _real_extract(self, url):
4072 # determine video id from url
# Prefer the numeric documentId query parameter when present; otherwise use
# the last path component matched by _VALID_URL.
4073 m = re.match(self._VALID_URL, url)
4075 numid = re.search(r'documentId=([0-9]+)', url)
4077 video_id = numid.group(1)
4079 video_id = m.group('video_id')
4081 # determine title and media streams from webpage
4082 html = self._download_webpage(url, video_id)
4083 title = re.search(self._TITLE, html).group('title')
4084 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means the video is age-restricted until 8 pm.
4086 assert '"fsk"' in html
4087 self._downloader.report_error(u'this video is only available after 8:00 pm')
4090 # choose default media type and highest quality for now
4091 stream = max([s for s in streams if int(s["media_type"]) == 0],
4092 key=lambda s: int(s["quality"]))
4094 # there's two possibilities: RTMP stream or HTTP download
4095 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4096 if stream['rtmp_url']:
4097 self.to_screen(u'RTMP download detected')
4098 assert stream['video_url'].startswith('mp4:')
4099 info["url"] = stream["rtmp_url"]
4100 info["play_path"] = stream['video_url']
4102 assert stream["video_url"].endswith('.mp4')
4103 info["url"] = stream["video_url"]
# Tumblr extractor: normalises the URL to the canonical post URL, then finds
# the \x22-escaped video_file src (and its MIME subtype as the extension)
# embedded in the page's inline javascript.
4106 class TumblrIE(InfoExtractor):
4107 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4109 def _real_extract(self, url):
4110 m_url = re.match(self._VALID_URL, url)
4111 video_id = m_url.group('id')
4112 blog = m_url.group('blog_name')
4114 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4115 webpage = self._download_webpage(url, video_id)
# The src/type attributes are hex-escaped (\x22 = double quote) in the page.
4117 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4118 video = re.search(re_video, webpage)
4120 self.to_screen("No video founded")
4122 video_url = video.group('video_url')
4123 ext = video.group('ext')
4125 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4126 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4128 # The only place where you can get a title, it's not complete,
4129 # but searching in other places doesn't work for all videos
4130 re_title = r'<title>(?P<title>.*?)</title>'
4131 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4133 return [{'id': video_id,
# Bandcamp extractor: only works for tracks with a free-download page. It
# follows freeDownloadPage, reads the track JSON, then rewrites the expired
# download URL into a /statdownload request to obtain a fresh one.
4140 class BandcampIE(InfoExtractor):
4141 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4143 def _real_extract(self, url):
4144 mobj = re.match(self._VALID_URL, url)
4145 title = mobj.group('title')
4146 webpage = self._download_webpage(url, title)
4147 # We get the link to the free download page
4148 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4149 if m_download is None:
4150 self._downloader.report_error('No free songs founded')
4152 download_link = m_download.group(1)
# Track id is scraped out of the inline TralbumData javascript object.
4153 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4154 webpage, re.MULTILINE|re.DOTALL).group('id')
4156 download_webpage = self._download_webpage(download_link, id,
4157 'Downloading free downloads page')
4158 # We get the dictionary of the track from some javascrip code
4159 info = re.search(r'items: (.*?),$',
4160 download_webpage, re.MULTILINE).group(1)
4161 info = json.loads(info)[0]
4162 # We pick mp3-320 for now, until format selection can be easily implemented.
4163 mp3_info = info[u'downloads'][u'mp3-320']
4164 # If we try to use this url it says the link has expired
4165 initial_url = mp3_info[u'url']
4166 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4167 m_url = re.match(re_url, initial_url)
4168 #We build the url we will use to get the final track url
4169 # This url is build in Bandcamp in the script download_bunde_*.js
# The .rand value is hard-coded; the server answers with a retry_url instead
# of download_url, which is what gets used below.
4170 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4171 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4172 # If we could correctly generate the .rand field the url would be
4173 #in the "download_url" key
4174 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4176 track_info = {'id':id,
4177 'title' : info[u'title'],
4180 'thumbnail' : info[u'thumb_url'],
4181 'uploader' : info[u'artist']
class RedtubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
    IE_NAME = u'redtube'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Guard reconstructed (elided in the reviewed view): without it
            # mobj.group() below would raise AttributeError.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        # Single raw literal replaces the pointless 'a'+'b'+'c' concatenation;
        # the resulting pattern string is byte-identical.
        mobj = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if mobj is not None:
            video_url = mobj.group(1)
        else:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is not None:
            video_title = mobj.group(1)
        else:
            # Fall back to a timestamped placeholder title.
            video_title = 'Redtube - %s' % time.ctime()

        # NOTE(review): part of this return dict was elided in the reviewed
        # view; 'id'/'url' keys reconstructed from the values extracted above.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
# NOTE(review): the body of gen_extractors() is almost entirely elided in this
# view — only three of the instantiated extractor entries (and not the list
# brackets or return statement) are visible. Kept byte-identical; do not edit
# without the full file.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the info extractor class named ``ie_name`` + 'IE'.

    Raises KeyError if no such class exists at module level.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]