InfoExtractors: use _download_webpage in more IEs
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) receives a URL and produces the metadata
    needed to download whatever video(s) that URL refers to: the real media
    URL, the title, the uploader, and so on.  The result is a dictionary
    which is handed to the FileDownloader, which in turn acts on it
    (typically by downloading the video to the file system).

    Every result dictionary must carry these keys:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    These keys are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should override _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; they should normally also be added to the
    list of extractors.  _real_extract() must return a *list* of
    information dictionaries as described above.  Broken extractors should
    set the _WORKING attribute to False so that users are warned and the
    tests skip them.
    """

    # Class-level defaults; _ready is reset per instance in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create an extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when this IE can handle the given URL."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return the value of the _WORKING flag."""
        return cls._WORKING

    def initialize(self):
        """Initialize the instance once (authentication, etc.)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization; redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction; redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note=None prints the default download message, note=False prints
        nothing, any other value is shown to the user.  Network failures
        are converted into ExtractorError (message from errnote).
        """
        if errnote is None:
            errnote = u'Unable to download webpage'
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download url_or_request and return the page body as a string."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset announced in the Content-Type header, if any;
        # otherwise assume UTF-8.
        content_type = handle.headers.get('Content-Type', '')
        charset_m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_m.group(1) if charset_m else 'utf-8'
        raw = handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is already a plain URL string
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(raw).decode('ascii')
            self._downloader.to_screen(dump)
        return raw.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixed with '[ie_name]'."""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Announce that information extraction has started."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Announce a webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Announce an age-confirmation attempt."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Mark video_info as a plain video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Return a result pointing at a page that still needs processing."""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url', 'url': url, 'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Return a playlist result wrapping the given entries."""
        result = {'_type': 'playlist', 'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string (height x width)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to the
        # playlist extractor.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of the subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} or, on failure, (error_msg, None)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Download the single subtitle track chosen by the user options.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return list of tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each itag with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are available,
        log in and confirm age.  All failures are reported as warnings or
        errors and abort initialization without raising."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden anti-forgery tokens out of the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the 11-char video id embedded in url, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 2 is the bare video id (group 1 is the optional URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info.  Try the different 'el' variants in turn; the
        # first response that contains a 'token' wins.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Build itag -> URL.  Not every stream entry carries a 'sig'
            # field; indexing it unconditionally used to raise KeyError and
            # abort the whole extraction, so append it only when present.
            url_map = {}
            for ud in url_data:
                real_url = ud['url'][0]
                if 'sig' in ud:
                    real_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = real_url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the age confirmation so that
        later requests are not blocked by the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video information for a metacafe.com watch URL.

        Returns a single-element list with the info dict, or None after
        reporting an error.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube: "yt-<id>" ids are delegated
        # to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # _download_webpage returns an already-decoded unicode string.
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available; it must be appended to the
            # media URL as a query parameter.
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: the media data is embedded in the flashvars value.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # webpage is unicode, so no .decode('utf-8') is needed on the
        # extracted groups (decoding a unicode string would raise).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
781
782
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract video information for a Dailymotion video URL."""
        # The video id is the first path component, stripped of the
        # "_title" suffix and of any query string.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos remain reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The media URLs live inside a JavaScript "flashvars" assignment.
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best quality available, highest first.
        max_quality = None
        for candidate in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if candidate in flashvars:
                max_quality = candidate
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self.to_screen(u'Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then fall back to
        # the "official user" markup; missing uploader is only a warning.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # Upload date, when present, is rendered as DD-MM-YYYY.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
862
863
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video information for a photobucket.com flv URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Use _download_webpage (like the other IEs in this file) instead
        # of a raw urlopen: it reports progress, handles errors and
        # returns an already-decoded unicode string, so the old
        # .decode('utf-8') calls on the extracted groups are dropped.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader are both captured by the <title> regex.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
916
917
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information for a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted (with new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            webpage = self._download_webpage(url, video_id)

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information.
        # _download_webpage returns unicode, so the former .decode('utf-8')
        # calls on regex groups are no longer needed (and would fail).
        webpage = self._download_webpage(url, video_id)

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is the (people|profile) alternation; the uploader name
        # is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        playlist_url = ('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        webpage = self._download_webpage(playlist_url, video_id)

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1048
1049
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and HLS-redirect URLs to a canonical form.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed; IndexError/ValueError from split/json.loads
            # are still handled here.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket, preferring hd, then sd.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1154
1155
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download a page, reporting (not raising) on network errors.

        NOTE(review): on failure this returns None, which makes the
        re.search in grep_webpage raise — presumably an acceptable hard
        abort, but worth confirming.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch `url`, apply `regex` with `regexFlags`, and build a dict
        from matchTuples, a list of (group_index, key, error_message).

        Returns None (after reporting) if the regex or any required group
        fails to match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS to locate the rtmp path and player."""
        # Language code is embedded in the URL path (e.g. .../fr/...).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned, so live
        # streams yield no result (see _real_extract) — looks unfinished.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page down to its HD media URL via two
        intermediate XML documents, returning the info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the player param points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> ref matching the page language.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id, title, date and the HD-quality URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and Arte+7 extraction.

        NOTE(review): the live branch returns None (extractLiveStream's
        result is discarded), so live URLs currently produce nothing.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1279
1280
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that the generic IE is in use."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects; returns the final URL, or False if it equals `url`.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Try to find an embedded media URL in an arbitrary webpage."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-paste error: this branch previously reported
            # 'unable to extract title'.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1417
1418
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearch[N|all]:<query>)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return up to the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUGFIX: the results were computed but never returned
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The GData API serves 50 results per page; keep paging until we have
        # enough ids or the API reports there are no more.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request past the total the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1489
1490
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearch[N|all]:<query>)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and enqueue that many result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUGFIX: decode to str so the str regexes below work on
                # Python 3 (matching a str pattern against bytes raises
                # TypeError); this mirrors YoutubeSearchIE.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "next page" link: enqueue what we found and stop
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1568
1569
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearch[N|all]:<query>)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and enqueue that many result downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUGFIX: decode to str so the str regexes below work on
                # Python 3 (matching a str pattern against bytes raises
                # TypeError); this mirrors YoutubeSearchIE.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No "Next" link: enqueue what we found and stop
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1651
1652
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch the playlist via the GData API, page by page, and return a
        playlist result with one Youtube URL entry per video."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Either alternative of the pattern captures the playlist id
        playlist_id = mobj.group(1) or mobj.group(2)
        positioned_videos = []

        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                if 'content' in entry:
                    positioned_videos.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(feed['entry']) < self._MAX_RESULTS:
                break

        # Order by playlist position, keep only the video URLs
        ordered_urls = [src for _pos, src in sorted(positioned_videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1721
1722
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in page, in order of first appearance."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Collect every video id of the channel and return them as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Any further pages come from the JSON-based channel_ajax endpoint
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum = pagenum + 1

            ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
            raw = self._download_webpage(ajax_url, channel_id,
                                         u'Downloading page #%s' % pagenum)

            payload = json.loads(raw)

            video_ids.extend(self.extract_videos_from_page(payload['content_html']))

            more_pages = self._MORE_PAGES_INDICATOR in payload['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1781
1782
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Fetch all of a user's upload ids via the GData API and return them
        as a playlist of Youtube watch URLs."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The Data API caps each response at _GDATA_PAGE_SIZE ids, so keep
        # requesting consecutive pages until a short page signals the end.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the unique ids on this page, preserving order
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            # A page that is not "full" must be the last one; no need to
            # query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1840
1841
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through their episode
        list via the mobile Ajax endpoint and return all videos as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # BUGFIX: mobj.group(1) was previously called unconditionally and
        # crashed with AttributeError when the attribute was not on the page
        if mobj is None:
            self._downloader.report_error(u'unable to extract users id')
            return
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1901
1902
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # _download_webpage handles reporting and network errors and returns
        # a unicode string, so the manual urlopen/except dance and the later
        # .decode('utf-8') calls (which break on Python 3) are not needed.
        webpage = self._download_webpage(request, file_id, u'Downloading webpage')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1953
1954
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc, if any.
        Login failures are only warnings: public videos work anonymously."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # BUGFIX: decode so the str regex below works on Python 3
            # (matching a str pattern against bytes raises TypeError)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means the login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters live in a JS snippet between these markers
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2052
2053
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page; the real file id is carried
        # in the fragment of the redirect target, so follow the redirect and
        # restart extraction with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask blip.tv for the JSON description of the video.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # On Python 3 the basename is already text; the previous
                # unconditional decode crashed there (str has no .decode).
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Single videos are wrapped in a 'Post' object; playlists
                # deliver the payload at top level.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2150
2151
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed typo: was self._download.report_error, which raised
            # AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media URL prefix is embedded in the thumbnail link.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2193
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so we cannot rely on the
        # base class's plain re.match.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert shortcuts to the full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        if dlNewest:
            # The full-episodes page redirects to the newest episode; follow
            # the redirect to learn the concrete episode URL. (Previously the
            # code referenced an undefined htmlHandle here, raising NameError.)
            try:
                htmlHandle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')
        webpage = self._download_webpage(url, epTitle)

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One RSS item per part of the episode; each part becomes one result.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only exposes rtmp URLs; rewrite them to the known
            # progressive-download mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2364
2365
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = m.group('showname')
        video_id = m.group('episode')

        self.report_extraction(show_name)
        page = self._download_webpage(url, show_name)

        # Metadata lives in the page's <meta> tags.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL carries the (URL-encoded) config location.
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        config_text = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2421
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # Use the shared _download_webpage helper instead of raw urlopen,
        # consistent with the other IEs (it also handles download errors).
        metaXml = self._download_webpage(xmlUrl, video_id,
                                         u'Downloading info XML',
                                         u'unable to download video info XML')

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = self._download_webpage(manifest_url, video_id,
                                             note=False,
                                             errnote=u'unable to download video info XML')

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            # NOTE(review): a missing 'url' attribute would raise KeyError,
            # which this except clause does not catch.
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2488
2489
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is URL-encoded in a flashvars-style parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail URL appears verbatim in the page source.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2543
2544
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also part of the url)
        # (removed unused local simple_title)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the human-readable page URL to the
        # JSON track resource of the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2602
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also part of the url)
        # (removed unused local simple_title)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps the human-readable page URL to the
        # JSON playlist resource of the API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        # One stream lookup per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2666
2667
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded rtmp path.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        # Description is optional; fall back to a placeholder.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)
        else:
            video_description = u'No description available.'

        # Derive the id and extension from the last path component.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2717
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """List every format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (re.match groups are already text; the previous .decode('utf-8')
        # calls crashed on Python 3, where str has no decode method)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes first; json.loads on bytes is
        # not supported on all Python 3 versions)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its URLs responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2825
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three kinds of URLs: a specific video page (course + video),
    a course page (expanded into references to its video pages), and the
    site root (expanded into references to all course pages).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # Per-video metadata lives in an XML file next to the media.
            metaXml = self._download_webpage(xmlUrl, info['id'],
                                        note=u'Downloading video info XML',
                                        errnote=u'unable to download video info XML')
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Expand the course page into references to its video pages.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote=u'unable to download course info page')

            info['title'] = info['id']

            # Expand the root page into references to all course pages.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2929
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com music video pages."""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage returns an already-decoded unicode string,
        # so the extracted groups must not be .decode()d again (str has no
        # .decode on Python 3, and it would double-decode on Python 2).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        metadataXml = self._download_webpage(videogen_url, video_id,
                                             note=u'Downloading video metadata',
                                             errnote=u'unable to download video metadata')

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3005
3006
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple segments; one info dict per segment
    is returned, with ids of the form '<video_id>_partNN'.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Build a session id for the getFlvPath endpoint: millisecond
        # timestamp followed by two random number blocks.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle a fixed character alphabet using the
        # server-provided seed (a linear-congruential-style generator).
        # The shuffled list is the lookup table used by _get_file_id.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # De-obfuscate the file id: each '*'-separated number is an index
        # into the seed-shuffled alphabet from _get_file_ID_mix_string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        # The playlist endpoint returns JSON with title, seed, stream file
        # ids and per-segment keys.
        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user-requested format onto Youku's stream names:
            # 'hd2' when available for 'best' (still delivered as flv),
            # 'mp4' for 'worst', plain 'flv' otherwise.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (as two hex digits) into the
            # decoded file id, then build the per-segment download URL.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3101
3102
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Fetch the page once; every field below is scraped from it.
        webpage = self._download_webpage(url, video_id)

        def _scrape(pattern, errmsg):
            # Return the first capture group, or None after reporting.
            match = re.search(pattern, webpage)
            if match is None:
                self._downloader.report_error(errmsg)
                return None
            return match.group(1)

        flv_url = _scrape(self.VIDEO_URL_RE, u'unable to extract video url')
        if flv_url is None:
            return
        video_url = compat_urllib_parse.unquote(flv_url)

        video_title = _scrape(self.VIDEO_TITLE_RE, u'unable to extract video title')
        if video_title is None:
            return

        video_thumbnail = _scrape(self.VIDEO_THUMB_RE, u'unable to extract video thumbnail')
        if video_thumbnail is None:
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3150
3151
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            # Previously only reported the error and fell through to
            # mobj.group(1), crashing with an AttributeError.
            raise ExtractorError(u'unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            # Previously fell through to links[-1] below, crashing with
            # an IndexError on an empty list.
            raise ExtractorError(u'unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3262
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape one property from the page; fall back to default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was the typo 'uploader_date'; the documented
            # optional field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3298
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # Must raise rather than return None: the caller unpacks the
            # result as a 2-tuple, so returning None caused a TypeError.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # A channel URL (group 1 only) is paged through the archives API;
        # a single broadcast URL uses the by_archive endpoint.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3377
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Previously only reported and fell through to m.group(),
            # crashing with an AttributeError.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> before giving up.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3416
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailers (one playlist per game)."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Pass the age gate with a fixed date of birth so the movie list
        # is always served.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order, so zip pairs each movie
        # with its matching title and thumbnail.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Previously only reported the error and still appended an
                # entry with an empty URL; abort instead.
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3461
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV location follows directly from the recording id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3483
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""

    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The media URL is embedded verbatim in the page source.
        _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group()
            # The container format is part of the CDN URL itself.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fallback title (typo fixed: was 'World Start Hip Hop').
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3538
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # All show metadata is embedded as a JSON blob in an inline script.
        json_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if json_m is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3573
3574
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' key equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional: only warn when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional: only warn when missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # Bug fix: this used to test the stale 'result' variable (always
            # non-None at this point), so an unavailable requested format
            # silently returned [None] instead of reporting an error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3691
3692
3693
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both id and title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL from the player setup.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this error previously claimed the *title* could not be
            # extracted, but the failing step is the upload date.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3735
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream location lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3781
class EightTracksIE(InfoExtractor):
    """Extracts every track of an 8tracks mix via the site's jsonh API."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API wants a random session token; each call returns one
        # track plus a flag telling us when the mix is exhausted.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3825
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN layout keyed by the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
3849
class TEDIE(InfoExtractor):
    # Verbose-mode regex: matches playlist URLs (/playlists/<id>/<name>)
    # and single-talk URLs (/talks/<name>); the named groups drive dispatch.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: a single talk yields one info dict; a playlist yields a
        # playlist result whose entries are delegated back to this IE.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # One match per talk <li> in the playlist markup; zipped below with
        # the title/URL matches on the assumption they appear in the same order.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is handed back to TEDIE via url_result rather than being
        # extracted inline here.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and the media
        # slug from which the download URL is built.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3927
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the XML metadata document for this video id.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # Mandatory fields: download URL and title.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: description and preview image may be absent.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3983
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML document listing the available files; we use the
        # last entry.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4016
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            # Bug fix: previously fell through here and crashed with an
            # AttributeError on m.group() below when the title was missing.
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4065
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter when present;
        # otherwise fall back to the last URL path component.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max((s for s in streams if int(s['media_type']) == 0),
                     key=lambda s: int(s['quality']))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: player URL goes in 'url', the mp4 path in 'play_path'.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            # Plain HTTP download.
            assert stream['video_url'].endswith('.mp4')
            info['url'] = stream['video_url']
        return [info]
4105
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
            return []

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title_match = re.search(re_title, webpage, re.DOTALL)

        return [{'id': video_id,
                 'url': video.group('video_url'),
                 'title': unescapeHTML(title_match.group('title')),
                 'thumbnail': thumb,
                 'ext': video.group('ext')
                 }]
4139
class BandcampIE(InfoExtractor):
    """Resolves a bandcamp track page to its free mp3-320 download."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            self._downloader.report_error('No free songs founded')
            return
        download_link = m_download.group(1)
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # The track dictionary lives in a javascript "items" list on the page.
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': info[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': info[u'thumb_url'],
                 'uploader': info[u'artist']
                 }]
4185
4186
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # One entry per IE class, in matching-priority order.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        GenericIE,  # must stay last: it is the catch-all fallback
    )
    return [ie() for ie in ie_classes]
4244
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # IE classes follow the '<Name>IE' naming convention at module level.
    return globals()['%sIE' % ie_name]