Fix GoogleSearchIE (Fixes #822)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready/_downloader are re-assigned per instance
    # in __init__, _WORKING is meant to be overridden by broken subclasses.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), exactly once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived by stripping the trailing 'IE' from the class name;
        # subclasses may shadow this property with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle.

        note=None prints the default "Downloading webpage" message,
        note=False suppresses any message, any other value is printed.
        Raises ExtractorError (with the original traceback attached) on
        network/HTTP failure.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported at the top of this module;
            # it is presumably provided by `from .utils import *` -- confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Returns a tuple (page content as string, URL handle)."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Extract only the codec name from e.g. 'text/html; charset=utf-8';
        # stop at whitespace/';' and tolerate an optionally quoted value so
        # trailing MIME parameters cannot leak into the codec name and make
        # bytes.decode() raise LookupError.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset="?([^"\s;]+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # url_or_request may be a Request object or a plain URL string
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps going on undecodable bytes instead of raising
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string."""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: matched with re.VERBOSE (see suitable/_extract_id).
    # Group 1 is everything before the ID; group 2 is the video ID itself.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything missing falls back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'HxW' display string used by --list-formats and 'format'
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the base-class IE_NAME property with a fixed name
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so let the playlist IE win.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} on success, or a
        (error_message, None) tuple on failure -- callers distinguish the
        two cases with isinstance(..., tuple)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Each match is (track name, lang code); keyed by lang code below
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        # Only prints the languages; does not return anything useful
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        error_message is None on success.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        Language priority: --sub-lang option, then 'en', then the first
        language the video offers.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        # Like _extract_subtitle, but fetches every available language
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print 'itag : extension [dimensions]' for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age.

        Credentials come from --username/--password or, with --netrc, from
        the 'youtube' machine entry in ~/.netrc. All failures are reported
        as warnings/errors and abort initialization without raising.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-forgery tokens (GALX, dsh) out of the login form;
        # they stay None when not found and are submitted as-is below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the video ID (group 2 of _VALID_URL) from a URL;
        reports an error and returns None when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract video metadata and download URLs.

        Returns a list with one info dict per selected format, or None on
        any fatal error (reported via the downloader).
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one response
        # contains a 'token' field. note=False suppresses the per-request
        # status line for these retries.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort: only a warning when missing)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators to spaces, then parse
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: page element first, then the <meta> tag as fallback
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Each comma-separated entry is itself a querystring describing
            # one stream (itag, url, sig, ...).
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): entries are filtered for 'itag'/'url' only, so a
            # stream without a 'sig' field would raise KeyError here -- confirm
            # 'sig' is always present.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            # Intersect the quality-ordered list with what is actually offered
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected (format, url) pair
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST the age
        confirmation so filtered videos are reachable later on."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        # urlencode() returns text; POST data must be bytes under Python 3,
        # and encoding to ASCII is a no-op on Python 2.
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form).encode('ascii'))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a Metacafe watch page.

        Returns a single-element list of info dicts, or a url_result pointing
        at YouTube for 'yt-' prefixed ids.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ('yt-<youtube id>' ids are mirrors)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        # (_download_webpage returns decoded text, never bytes)
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: direct mediaURL parameter
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended as __gda__) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media info lives in the flashvars query string
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # mediaURL is JSON-escaped ("\/" for "/")
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # webpage is already text, so no .decode() here: the previous
        # str.decode('utf-8') crashed on non-ASCII titles (and on Python 3).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
786
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality media URL, title, uploader and upload
        date from a Dailymotion video page; returns a one-element list of
        info dicts (or None after report_error on failure)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The URL path segment is '<id>_<slug>'; keep only the id part and
        # strip any query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information.
        # The cookie disables the family filter so restricted videos load.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the highest-quality key present; list is ordered best first,
        # and the for/else reports an error only if no key matched at all.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # The URL is JSON-escaped inside flashvars ("\/" for "/")
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader is optional: try the regular owner span first, then the
        # official-user markup; only warn (not error) when both fail.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for an official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date is optional too; page shows DD-MM-YYYY, we store YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
866
867
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the direct .flv URL, title and uploader from a
        Photobucket page; returns a one-element list of info dicts."""
        # Extract id from URL (the 'current=' query parameter)
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # _VALID_URL only matches .flv resources
        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Use the shared helper (as the other extractors in this file do)
        # so we get decoded text back instead of raw bytes; regex matching
        # on the bytes from urlopen().read() fails under Python 3.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # webpage is already text, so the old .decode('utf-8') calls (which
        # crashed on non-ASCII and on Python 3) are unnecessary.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
920
921
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE(review): disabled (_WORKING = False). The .decode('utf-8') calls
    below assume Python 2 byte strings from urlopen().read(); they would
    fail on Python 3 — confirm before re-enabling.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Yahoo! Video page.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted via a single recursive call (new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            # The page embeds the canonical ids as ("id", "...") / ("vid", "...")
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # NOTE(review): group(1) is the 'people|profile' alternation, not the
        # uploader name in group(2) — looks like an off-by-one; verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1052
1053
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page by parsing the embedded
        player config JSON; returns a one-element list of info dicts."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and HLS-redirect URLs to a canonical page URL
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON by slicing the page between the
        # ' = {config:' and ',assets:' markers.
        # NOTE(review): bare except hides unexpected errors (e.g. json bugs);
        # consider narrowing to (IndexError, ValueError).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id (last path segment of the owner URL)
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description (from the page HTML, not the config JSON)
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        # (both are required by the play_redirect URL below)
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one codec; the
        # for/else reports an error only when all buckets are empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1158
1159
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None after
        report_error on network failure / invalid URL."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and map the groups
        named in *matchTuples* [(group_index, key, error_msg), ...] into a
        dict; reports *error_msg* and returns None when a group is missing."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to the rtmp live-stream parameters.

        NOTE(review): video_url is computed but never returned, and
        _real_extract discards the call's result — live URLs currently
        produce no downloadable info. Likely missing a 'return'.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through its videoref XML chain and return
        an info dict for the 'hd' quality stream."""
        video_lang = url.split('/')[-3]
        # Step 1: the page's flash movie param carries the videoref file URL
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> ref from the videoref XML
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML holds id, title, date and the hd stream URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode assumes a Python 2 byte string here
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and arte+7 extraction based on the
        URL's trailing segment."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1283
1284
1285 class GenericIE(InfoExtractor):
1286     """Generic last-resort information extractor."""
1287
1288     _VALID_URL = r'.*'
1289     IE_NAME = u'generic'
1290
1291     def report_download_webpage(self, video_id):
1292         """Report webpage download."""
1293         if not self._downloader.params.get('test', False):
1294             self._downloader.report_warning(u'Falling back on generic information extractor.')
1295         super(GenericIE, self).report_download_webpage(video_id)
1296
1297     def report_following_redirect(self, new_url):
1298         """Report information extraction."""
1299         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1300
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and compares the
        final URL with the original; returns False when there was no redirect.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD verb so we never download response bodies
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Plain Request (GET) this time, via the parent opener
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             self._downloader.report_error(u'Invalid URL: %s' % url)
1366             return
1367
1368         self.report_extraction(video_id)
1369         # Start with something easy: JW Player in SWFObject
1370         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1371         if mobj is None:
1372             # Broaden the search a little bit
1373             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1374         if mobj is None:
1375             # Broaden the search a little bit: JWPlayer JS loader
1376             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1377         if mobj is None:
1378             self._downloader.report_error(u'Invalid URL: %s' % url)
1379             return
1380
1381         # It's possible that one of the regexes
1382         # matched, but returned an empty group:
1383         if mobj.group(1) is None:
1384             self._downloader.report_error(u'Invalid URL: %s' % url)
1385             return
1386
1387         video_url = compat_urllib_parse.unquote(mobj.group(1))
1388         video_id = os.path.basename(video_url)
1389
1390         # here's a fun little line of code for you:
1391         video_extension = os.path.splitext(video_id)[1][1:]
1392         video_id = os.path.splitext(video_id)[0]
1393
1394         # it's tempting to parse this further, but you would
1395         # have to take into account all the variations like
1396         #   Video Title - Site Name
1397         #   Site Name | Video Title
1398         #   Video Title - Tagline | Site Name
1399         # and so on and so forth; it's just not practical
1400         mobj = re.search(r'<title>(.*)</title>', webpage)
1401         if mobj is None:
1402             self._downloader.report_error(u'unable to extract title')
1403             return
1404         video_title = mobj.group(1)
1405
1406         # video uploader is domain name
1407         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1408         if mobj is None:
1409             self._downloader.report_error(u'unable to extract title')
1410             return
1411         video_uploader = mobj.group(1)
1412
1413         return [{
1414             'id':       video_id,
1415             'url':      video_url,
1416             'uploader': video_uploader,
1417             'upload_date':  None,
1418             'title':    video_title,
1419             'ext':      video_extension,
1420         }]
1421
1422
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ytsearch:Q (1 result), ytsearchN:Q (N results) and
    ytsearchall:Q (up to _max_youtube_results results) via the gdata API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Bug fix: split only on the first ':' so search terms that
        # themselves contain a colon no longer raise ValueError.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Bug fix: the result was computed but never returned, so
            # "ytsearchall:" queries silently produced nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request past what the API reports as available.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1493
1494
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles gvsearch:Q (1 result), gvsearchN:Q (N results) and
    gvsearchall:Q (up to _max_google_results results).
    """
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix and delegate to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._download_n_results(query, 1)
        elif prefix == 'all':
            return self._download_n_results(query, self._max_google_results)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query.

        Returns a playlist dict whose entries are url results, capped at n.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            # Bug fix: the first page must start at offset 0, not 10 —
            # the old pagenum*10 offset silently skipped the first ten
            # search results.
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), (pagenum - 1) * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Extract video identifiers
            entries_before = len(res['entries'])
            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)
                # Bug fix: stop at exactly n entries instead of returning
                # the whole last page (previously up to 9 extra results).
                if len(res['entries']) >= n:
                    return res

            # Stop when there is no "next" link, or when a page yields no
            # new entries (guards against looping forever if the result
            # markup changes under us).
            if len(res['entries']) == entries_before or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
1544
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and kick off the paged download."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            count = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if count <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (count, query))
            return
        if count > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, count))
            count = self._max_yahoo_results
        self._download_n_results(query, count)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        seen = set()
        pagenum = 1

        def _dispatch():
            # Hand every collected id straight to the downloader.
            for vid in collected:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated in first-seen order.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id in seen:
                    continue
                seen.add(video_id)
                collected.append(video_id)
                if len(collected) == n:
                    # Specified n videos reached
                    _dispatch()
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                _dispatch()
                return

            pagenum = pagenum + 1
1626
1627
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the gdata playlist feed and emit a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, video_url) pairs from the paged API.
        collected = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            feed_entries = response['feed'].get('entry')
            if not feed_entries:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            collected.extend((entry['yt$position']['$t'], entry['content']['src'])
                             for entry in feed_entries
                             if 'content' in entry)

            if len(feed_entries) < self._MAX_RESULTS:
                break

        # Restore playlist order before building the results.
        ordered_urls = [pair[1] for pair in sorted(collected)]

        url_results = [self.url_result(u, 'Youtube') for u in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1696
1697
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, de-duplicated in
        first-seen order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and emit a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(page))

        # Any further pages come from the JSON-based channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                payload = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                   for vid in video_ids]
        return [self.playlist_result(entries, channel_id)]
1756
1757
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect all uploads of a user via the paged gdata feed."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The gdata API caps each response (currently at 50 ids), so keep
        # requesting consecutive slices until a short page signals the end.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers, de-duplicated within the page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page shorter than the page size means there is nothing
            # further to fetch, so stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1815
1816
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect every episode of a blip.tv user via the Ajax episode list."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric user id needed by the Ajax endpoint is embedded in
        # the user's page.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Ajax results are capped (currently at 12 entries per request),
        # so walk the pages until one comes back short.
        video_ids = []
        for pagenum in itertools.count(1):
            result_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(result_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers (raw hrefs are used for the
            # duplicate check; the stored values are HTML-unescaped).
            ids_in_page = []
            for match in re.finditer(r'href="/([^"]+)"', page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(match.group(1)))

            video_ids.extend(ids_in_page)

            # A short page means there are no further episodes to fetch.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1876
1877
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Simulate pressing 'Free download' and extract the real file URL.

        Reports an error (and returns None) on network failures, on
        site-side download restrictions, and when the page layout cannot
        be parsed.
        """
        # The last path component doubles as the file id.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice and
                # surface it to the user verbatim.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; under Python 3 urlopen().read() returns bytes and
        # the str regexes above would already fail — confirm the intended
        # interpreter before touching this block.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1928
1929
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in to Facebook before extraction.

        Credentials come from --username/--password or, with --usenetrc,
        from the 'facebook' entry in ~/.netrc. Without credentials this is
        a silent no-op; login failures only produce warnings so public
        videos can still be attempted anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available: proceed without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Getting the login form echoed back means the POST did not
            # authenticate us.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page.

        Parses the JSON swf parameters embedded in the page markup,
        preferring the HD source over the SD one. Raises ExtractorError
        when the embedded data or the title cannot be located.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf variables sit between these two javascript snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the actual video sources.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2027
2028
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved to their canonical /a/a-<id> form
        and extraction is re-run on that. Otherwise the page is requested
        with skin=json; the response is either direct media (handled as a
        direct download) or JSON metadata that is parsed for the video
        URL, dates, and related fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose URL fragment carries the
        # real file id; recurse on the canonical URL built from it.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON skin variant to this user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # 'urlhandle' lets the downloader reuse this open response.
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # Reuse the handle already opened by the probe above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Timestamps come in the form "10-31-11 08:24PM".
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2125
2126
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a one-element list with the info dictionary, or None
        after reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this previously called self._download.report_error,
            # which raised AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail <link> carries the media base URL; the video file
        # itself lives at <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2168
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Format ids; these match the 'bitrate' attribute of the <rendition>
    # elements parsed from the configuration XML below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # File extension per format id (used by _print_formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per format id (used by _print_formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here so the multi-line _VALID_URL pattern is matched
        # with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print each available format id with its extension and dimensions.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of a show episode or clip.

        Returns a list of info dictionaries, one per <item> (part) found
        in the episode's MRSS index feed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # ':tds'-style shortnames map to the show's full-episodes page,
        # which is then re-matched against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "newest episode": the site
            # redirects to the concrete episode page, handled below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the redirect and re-parse the final URL to learn which
            # episode we actually landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> is colon-separated; its last component is the short
            # media id and the one before it names the show site.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Each part has its own configuration XML listing the available
            # renditions (bitrate + RTMP source URL).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2336
2337
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the MP4 URL for an Escapist Magazine episode page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        episode_id = mobj.group('episode')

        self.report_extraction(show_name)
        page = self._download_webpage(url, show_name)

        # Pull the interesting <meta> values out of the page markup.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumbnail_url = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL embeds a percent-encoded configuration URL.
        config_url = compat_urllib_parse.unquote(
            re.search('config=(.*)$', player_url).group(1))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        video_url = config['playlist'][1]['url']

        return [{
            'id': episode_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail_url,
            'description': description,
            'player_url': player_url,
        }]
2393
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video via the moogaloop metadata XML and the Adobe
        f4m manifest it points to; returns a one-element info list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # The info dict is filled in incrementally below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML with title/description/thumbnail and
        # the manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # Second request: the f4m manifest, which carries the media node id
        # and the real video id used to build the fragment URL.
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m XML namespace, so the lookups
            # must be namespace-qualified.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the final Seg1-Frag1 fragment URL on the manifest's host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2460
2461
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail for an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flv_url= parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the <title> tag, minus the trailing site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # For the thumbnail the entire matched URL is used.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2515
2516
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves the track through the public Soundcloud resolve API, then
       queries the stream-definition endpoint for the 128kbit MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the song slug are part of the URL itself.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        media_url = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2574
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

       Resolves the set through the public Soundcloud resolve API, then
       queries the stream-definition endpoint once per contained track.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader and the set slug are part of the URL itself.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2638
2639
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP URL and metadata for an InfoQ presentation."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64- and percent-encoded in a JS variable.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        # The description meta tag is optional; fall back to a placeholder.
        video_description = u'No description available.'
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if match is not None:
            video_description = match.group(1)

        # Derive id and extension from the media file name.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2689
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps a format name either to a dict of bitrate -> url list,
        or directly to a url list when no bitrate info is present.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate combination with its extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast via the Mixcloud v1 JSON API.

        Returns a one-element list with the info dictionary, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUG FIX: re.match groups are already text; the old .decode('utf-8')
        # calls raised AttributeError under Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # BUG FIX: both names were previously unbound when 'formats' was
        # empty, producing a NameError instead of an error report.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # No candidate URL answered; report instead of crashing below.
            self._downloader.report_error(u'unable to find a working download URL')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2797
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes, dispatched on the named groups of
    _VALID_URL: a specific video (course + video), a course page
    (recursively extracted as a list of video references), and the
    site root (recursively extracted as a list of course references).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall()[0] raises IndexError when the element is missing.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title falls back to the course id when no <h1> is found.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect unique links to the individual video pages and
            # delegate each one back through the extraction machinery.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                # NOTE(review): urlopen().read() returns bytes on Python 3,
                # but re.findall below uses a str pattern -- confirm this
                # code path is only exercised under Python 2.
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Same recursive pattern as above, one level up: each course
            # page reference is fed back through self.extract().
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2901
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single music video from an mtv.com video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # The scheme is optional in _VALID_URL; normalize to http.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from <meta> tags in the page head.
        # NOTE(review): .decode('iso-8859-1') only exists on Python 2 str;
        # _download_webpage returns text, so on Python 3 this would raise
        # AttributeError -- confirm which interpreters this path supports.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # The mediaGen endpoint returns an XML document listing the
        # available renditions for this uri/content/video id triple.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type like "video/mp4" -> extension "mp4".
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2977
2978
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its file ids: the real id is recovered by indexing
    into an alphabet shuffled with a seed-keyed linear congruential
    generator, and each video is served as multiple flv/mp4 segments.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the id alphabet using an LCG keyed by seed.

        Returns the shuffled alphabet as a list of single characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step, then scale into the shrinking pool; order matters:
            # each picked character is removed before the next iteration.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id by
        looking each index up in the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format onto Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment, needed to authorize each segment URL.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3073
3074
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from a video page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        # All metadata is embedded in the page markup / flashvars.
        webpage = self._download_webpage(url, video_id)

        url_result = re.search(self.VIDEO_URL_RE, webpage)
        if url_result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_result.group(1))

        title_result = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_result.group(1)

        thumb_result = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_result.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3122
3123
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Raises ExtractorError when the video page or the video links
        cannot be located in the post.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional; None when not found)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: use the first line of the post description
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            # Raise instead of only reporting: falling through would crash
            # below with AttributeError on mobj.group(1).
            raise ExtractorError(u'unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            # Raise instead of only reporting: an empty result would crash
            # below with IndexError on links[-1].
            raise ExtractorError(u'unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3234
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the path component of the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) capture of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Was misspelled 'uploader_date'; the documented output field
            # (see the InfoExtractor class docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3270
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of archive JSON and build info dicts.

        Only clips with a non-empty 'video_file_url' are kept; returns
        (number of clips in the raw response, list of info dicts).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with YYYY-MM-DD; drop the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type (channel / broadcast / chapter) and page
        through the archive API as needed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # A whole channel: results are fetched page by page below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the archive this chapter belongs to; the for/else
            # raises when no <archive> matches archive_id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter title/description/uploader come from the Twitch
            # kraken API rather than the justin.tv archive XML.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3403
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise (instead of only reporting the error and falling
            # through) to avoid an AttributeError on m.group below.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        # Title: prefer the player heading, fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Same as above: raising prevents m.group on a None match.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        # Description is optional.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3441
class SteamIE(InfoExtractor):
    """Information extractor for Steam store movie pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        gameID = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')
        # Request the age-gated video page with a fixed birth date so the
        # real content is served.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movies, their display names and their thumbnails appear in the
        # same order in the page, so iterate the three scans in lockstep.
        movie_matches = re.finditer(r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", webpage)
        name_matches = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_matches = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for movie, name, thumb in zip(movie_matches, name_matches, thumb_matches):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3486
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Recorded videos are served straight from the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3508
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The media URL is stashed in a flashvars assignment.
        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3557
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract a show from rbmaradio.com via the embedded gon.show JSON."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON literal inside an inline script
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN URL given in the metadata
        video_url = show['akamai_url'] + '&cbr=256'
        ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = show.get('host', {})

        return [{
            'id': video_id,
            'url': video_url,
            'ext': ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3592
3593
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract every downloadable format and select one per the 'format' option."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal: warn and continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 4th path component encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this previously tested the unrelated `result` variable,
            # so an unavailable requested format silently returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3710
3711
3712
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date; the title comes from the URL itself."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this branch previously reported "unable to extract video
            # title", which is misleading — it is the upload date that is missing.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3754
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page referenced by the watch page to get the flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The media URL only appears on a separate embed page
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3800
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play/next API and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON literal assigned to PAGE.mix
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(m.group(1))

        # The play API wants a (random) session identifier
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            entries.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            # The API signals the final track; otherwise ask for the next one
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track['id']))
        return entries
3844
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN media/thumbnail URLs from the id; scrape title and uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
3868
class TEDIE(InfoExtractor):
    # Verbose regex (needs re.VERBOSE, hence the suitable() override below)
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL shape."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk <li> carries its ids and media slug in data-* attributes
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # The talk page URL and display name come from a separate markup fragment
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk URL back to this extractor via url_result
        # NOTE(review): assumes both finditer streams yield entries in the same
        # order, since zip pairs them positionally — confirm against page layout
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The inline talkDetails script holds the numeric id and the media slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3947
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a video by querying myspass' XML metadata endpoint."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name `ext` and
            # raised NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata document
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4003
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract title from the page and stream data from the per-video XML."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Stream variants are described in a separate XML document per video
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # Use the last listed variant, mirroring the original selection logic
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4036
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the direct media URL plus title/description/uploader metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config contains the direct media URL
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Bug fix: previously this branch fell through and crashed with
            # AttributeError on m.group('title') below; bail out instead.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4085
class ARDIE(InfoExtractor):
    # Matches watch pages on both ardmediathek.de and mediathek.daserste.de
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # mediaCollection.addMediaStream(media_type, quality, rtmp_url, video_url, ...)
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract an ARD Mediathek video, as RTMP stream or direct HTTP mp4."""
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams in the page implies an age-gated ("fsk") video
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4125
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a Tumblr-hosted video from a post page."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonicalize to the /post/ URL, which embeds the video player
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is escaped (\x22 == '"') inside a script string
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb_match = re.search(re_thumb, webpage)
        # Bug fix: .group() was previously called unconditionally, raising
        # AttributeError on posts without a poster image; fall back to None.
        thumb = thumb_match.group('thumb').replace('\\', '') if thumb_match else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4159
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a free Bandcamp track by walking its free-download flow."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            self._downloader.report_error('No free songs founded')
            return
        download_link = m_download.group(1)
        # The numeric track id is in the inline TralbumData javascript object
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url,
        # mirroring how Bandcamp's download_bunde_*.js script builds it
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
4205
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source URL and title from a redtube watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = source_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4236
4237
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Ordered most-specific first (search/playlist IEs before the generic ones)
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        GenericIE,
    ]
    return [ie_class() for ie_class in ie_classes]
4296
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention at module level
    return globals()['%sIE' % ie_name]