Implement search_regex from #847
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor receives a URL and pulls out all the
    information about the video (or videos) the URL refers to: the real
    video URL, the video title, author and others. The information is
    stored in a dictionary which is then passed to the FileDownloader,
    which processes it, possibly downloading the video to the file
    system, among other possible outcomes.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp; they should
    normally also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Broken extractors should set the _WORKING attribute to False so
    users are warned and the tests skip them.
    """

    # Class-level defaults; instances overwrite _ready/_downloader.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this extractor can handle the given URL."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return whether this extractor is marked as working."""
        return cls._WORKING

    def initialize(self):
        """Initialize the instance once (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader used for output and option lookup."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the name by stripping the trailing "IE" off the class name.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note=None prints the default download message, note=False is
        silent, any other note is shown to the user. Network failures
        are re-raised as ExtractorError (message errnote).
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        if errnote is None:
            errnote = u'Unable to download webpage'
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Return a tuple (page content as string, URL handle)."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        charset_m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # Fall back to UTF-8 when the server does not declare a charset.
        encoding = charset_m.group(1) if charset_m else 'utf-8'
        raw = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL has no get_full_url().
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(raw).decode('ascii')
            self._downloader.to_screen(dump)
        return (raw.decode(encoding, 'replace'), urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a string."""
        content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
        return content

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'."""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Result helpers following #608: they set the '_type' key so the
    # downloader knows how to process each entry.
    def video_result(self, video_info):
        """Tag video_info as a plain video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Return a result pointing to another page to be processed."""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Return a playlist result wrapping entries."""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result

    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
        """Extract a field from some text based on regex."""
        mobj = re.search(pattern, text, flags)
        if mobj:
            # Return the first group that actually matched.
            return next(g for g in mobj.groups() if g is not None)
        if fatal:
            raise ExtractorError(u'Unable to extract %s; '
                u'please report this issue on GitHub.' % name)
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on GitHub.' % name)
        return None
207
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex matching this extractor's search queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested results.

        The prefix selects the result count: empty -> 1, 'all' ->
        _MAX_RESULTS, a positive integer -> that many (capped at
        _MAX_RESULTS with a warning).

        Raises ExtractorError if query does not match the search URL
        format.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regex only admits positive integers, but
            # keep the explicit check in case the pattern changes.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")
246
247
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Handles single-video URLs (watch pages, youtu.be short links,
    embeds, naked video IDs); playlist URLs are delegated to
    YoutubePlaylistIE via suitable().
    """

    # Verbose regex matching all supported single-video URL shapes;
    # group 2 captures the video ID (used by _extract_id()).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # URL used in _real_initialize() to force the site language to English.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Google account login endpoint (fetched and POSTed in _real_initialize()).
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Age-gate confirmation endpoint.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the original URL from an age-verification redirect URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display size; '???' where unknown. NOTE(review): values
    # appear to be height x width rather than the usual WxH -- confirm
    # before relying on the ordering.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
307
308     @classmethod
309     def suitable(cls, url):
310         """Receives a URL and returns True if suitable for this IE."""
311         if YoutubePlaylistIE.suitable(url): return False
312         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
313
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report a subtitle download attempt for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
354
355     def _get_available_subtitles(self, video_id):
356         self.report_video_subtitles_download(video_id)
357         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
358         try:
359             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
360         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
361             return (u'unable to download video subtitles: %s' % compat_str(err), None)
362         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
363         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
364         if not sub_lang_list:
365             return (u'video doesn\'t have subtitles', None)
366         return sub_lang_list
367
368     def _list_available_subtitles(self, video_id):
369         sub_lang_list = self._get_available_subtitles(video_id)
370         self.report_video_subtitles_available(video_id, sub_lang_list)
371
372     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
373         """
374         Return tuple:
375         (error_message, sub_lang, sub)
376         """
377         self.report_video_subtitles_request(video_id, sub_lang, format)
378         params = compat_urllib_parse.urlencode({
379             'lang': sub_lang,
380             'name': sub_name,
381             'v': video_id,
382             'fmt': format,
383         })
384         url = 'http://www.youtube.com/api/timedtext?' + params
385         try:
386             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
389         if not sub:
390             return (u'Did not fetch video subtitles', None, None)
391         return (None, sub_lang, sub)
392
393     def _request_automatic_caption(self, video_id, webpage):
394         """We need the webpage for getting the captions url, pass it as an
395            argument to speed up the process."""
396         sub_lang = self._downloader.params.get('subtitleslang')
397         sub_format = self._downloader.params.get('subtitlesformat')
398         self.to_screen(u'%s: Looking for automatic captions' % video_id)
399         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
400         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
401         if mobj is None:
402             return [(err_msg, None, None)]
403         player_config = json.loads(mobj.group(1))
404         try:
405             args = player_config[u'args']
406             caption_url = args[u'ttsurl']
407             timestamp = args[u'timestamp']
408             params = compat_urllib_parse.urlencode({
409                 'lang': 'en',
410                 'tlang': sub_lang,
411                 'fmt': sub_format,
412                 'ts': timestamp,
413                 'kind': 'asr',
414             })
415             subtitles_url = caption_url + '&' + params
416             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
417             return [(None, sub_lang, sub)]
418         except KeyError:
419             return [(err_msg, None, None)]
420
421     def _extract_subtitle(self, video_id):
422         """
423         Return a list with a tuple:
424         [(error_message, sub_lang, sub)]
425         """
426         sub_lang_list = self._get_available_subtitles(video_id)
427         sub_format = self._downloader.params.get('subtitlesformat')
428         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
429             return [(sub_lang_list[0], None, None)]
430         if self._downloader.params.get('subtitleslang', False):
431             sub_lang = self._downloader.params.get('subtitleslang')
432         elif 'en' in sub_lang_list:
433             sub_lang = 'en'
434         else:
435             sub_lang = list(sub_lang_list.keys())[0]
436         if not sub_lang in sub_lang_list:
437             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
438
439         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
440         return [subtitle]
441
442     def _extract_all_subtitles(self, video_id):
443         sub_lang_list = self._get_available_subtitles(video_id)
444         sub_format = self._downloader.params.get('subtitlesformat')
445         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
446             return [(sub_lang_list[0], None, None)]
447         subtitles = []
448         for sub_lang in sub_lang_list:
449             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
450             subtitles.append(subtitle)
451         return subtitles
452
453     def _print_formats(self, formats):
454         print('Available formats:')
455         for x in formats:
456             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
457
    def _real_initialize(self):
        """Set the site language to English and, when credentials are
        available (from options or .netrc), log in and confirm age.

        All failures except age confirmation are reported as warnings
        and abort silently; a failed age confirmation raises
        ExtractorError.
        """
        # Without a downloader we can neither read options nor report.
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
560
561     def _extract_id(self, url):
562         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
563         if mobj is None:
564             raise ExtractorError(u'Invalid URL: %s' % url)
565         video_id = mobj.group(2)
566         return video_id
567
568     def _real_extract(self, url):
569         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
570         mobj = re.search(self._NEXT_URL_RE, url)
571         if mobj:
572             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
573         video_id = self._extract_id(url)
574
575         # Get video webpage
576         self.report_video_webpage_download(video_id)
577         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
578         request = compat_urllib_request.Request(url)
579         try:
580             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
581         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
582             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
583
584         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
585
586         # Attempt to extract SWF player URL
587         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
588         if mobj is not None:
589             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
590         else:
591             player_url = None
592
593         # Get video info
594         self.report_video_info_webpage_download(video_id)
595         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
596             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
597                     % (video_id, el_type))
598             video_info_webpage = self._download_webpage(video_info_url, video_id,
599                                     note=False,
600                                     errnote='unable to download video info webpage')
601             video_info = compat_parse_qs(video_info_webpage)
602             if 'token' in video_info:
603                 break
604         if 'token' not in video_info:
605             if 'reason' in video_info:
606                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
607             else:
608                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
609
610         # Check for "rental" videos
611         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
612             raise ExtractorError(u'"rental" videos not supported')
613
614         # Start extracting information
615         self.report_information_extraction(video_id)
616
617         # uploader
618         if 'author' not in video_info:
619             raise ExtractorError(u'Unable to extract uploader name')
620         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
621
622         # uploader_id
623         video_uploader_id = None
624         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
625         if mobj is not None:
626             video_uploader_id = mobj.group(1)
627         else:
628             self._downloader.report_warning(u'unable to extract uploader nickname')
629
630         # title
631         if 'title' not in video_info:
632             raise ExtractorError(u'Unable to extract video title')
633         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
634
635         # thumbnail image
636         if 'thumbnail_url' not in video_info:
637             self._downloader.report_warning(u'unable to extract video thumbnail')
638             video_thumbnail = ''
639         else:   # don't panic if we can't find it
640             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
641
642         # upload date
643         upload_date = None
644         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
645         if mobj is not None:
646             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
647             upload_date = unified_strdate(upload_date)
648
649         # description
650         video_description = get_element_by_id("eow-description", video_webpage)
651         if video_description:
652             video_description = clean_html(video_description)
653         else:
654             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
655             if fd_mobj:
656                 video_description = unescapeHTML(fd_mobj.group(1))
657             else:
658                 video_description = u''
659
660         # subtitles
661         video_subtitles = None
662
663         if self._downloader.params.get('writesubtitles', False):
664             video_subtitles = self._extract_subtitle(video_id)
665             if video_subtitles:
666                 (sub_error, sub_lang, sub) = video_subtitles[0]
667                 if sub_error:
668                     # We try with the automatic captions
669                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
670                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
671                     if sub is not None:
672                         pass
673                     else:
674                         # We report the original error
675                         self._downloader.report_error(sub_error)
676
677         if self._downloader.params.get('allsubtitles', False):
678             video_subtitles = self._extract_all_subtitles(video_id)
679             for video_subtitle in video_subtitles:
680                 (sub_error, sub_lang, sub) = video_subtitle
681                 if sub_error:
682                     self._downloader.report_error(sub_error)
683
684         if self._downloader.params.get('listsubtitles', False):
685             sub_lang_list = self._list_available_subtitles(video_id)
686             return
687
688         if 'length_seconds' not in video_info:
689             self._downloader.report_warning(u'unable to extract video duration')
690             video_duration = ''
691         else:
692             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
693
694         # token
695         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
696
697         # Decide which formats to download
698         req_format = self._downloader.params.get('format', None)
699
700         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
701             self.report_rtmp_download()
702             video_url_list = [(None, video_info['conn'][0])]
703         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
704             url_map = {}
705             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
706                 url_data = compat_parse_qs(url_data_str)
707                 if 'itag' in url_data and 'url' in url_data:
708                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
709                     if not 'ratebypass' in url: url += '&ratebypass=yes'
710                     url_map[url_data['itag'][0]] = url
711
712             format_limit = self._downloader.params.get('format_limit', None)
713             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
714             if format_limit is not None and format_limit in available_formats:
715                 format_list = available_formats[available_formats.index(format_limit):]
716             else:
717                 format_list = available_formats
718             existing_formats = [x for x in format_list if x in url_map]
719             if len(existing_formats) == 0:
720                 raise ExtractorError(u'no known formats available for video')
721             if self._downloader.params.get('listformats', None):
722                 self._print_formats(existing_formats)
723                 return
724             if req_format is None or req_format == 'best':
725                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
726             elif req_format == 'worst':
727                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
728             elif req_format in ('-1', 'all'):
729                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
730             else:
731                 # Specific formats. We pick the first in a slash-delimeted sequence.
732                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
733                 req_formats = req_format.split('/')
734                 video_url_list = None
735                 for rf in req_formats:
736                     if rf in url_map:
737                         video_url_list = [(rf, url_map[rf])]
738                         break
739                 if video_url_list is None:
740                     raise ExtractorError(u'requested format not available')
741         else:
742             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
743
744         results = []
745         for format_param, video_real_url in video_url_list:
746             # Extension
747             video_extension = self._video_extensions.get(format_param, 'flv')
748
749             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
750                                               self._video_dimensions.get(format_param, '???'))
751
752             results.append({
753                 'id':       video_id,
754                 'url':      video_real_url,
755                 'uploader': video_uploader,
756                 'uploader_id': video_uploader_id,
757                 'upload_date':  upload_date,
758                 'title':    video_title,
759                 'ext':      video_extension,
760                 'format':   video_format,
761                 'thumbnail':    video_thumbnail,
762                 'description':  video_description,
763                 'player_url':   player_url,
764                 'subtitles':    video_subtitles,
765                 'duration':     video_duration
766             })
767         return results
768
769
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Visit the disclaimer page and confirm age so that filtered
        videos are served on subsequent requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age: POST the "over 18" form to disable the family filter
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video info dict from a metacafe.com watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Videos hosted on YouTube ("yt-<id>" ids) are delegated to the
        # YouTube extractor
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended as __gda__) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the JSON-ish mediaData inside the flashvars block
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # mediaURL uses JSON-escaped slashes; undo the escaping
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        # webpage is already decoded text, so no .decode('utf-8') calls here:
        # they fail on Python 3 (str has no decode) and crash on non-ASCII
        # titles on Python 2 (implicit ascii encode).
        video_title = self._search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')

        video_uploader = self._search_regex(r'submitter=(.*?);', webpage, u'uploader nickname')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
865
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the video info dict from a dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Drop the title slug ("_...") and any query string from the id
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage; disable the family filter so that
        # age-restricted videos are served
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        flashvars = compat_urllib_parse.unquote(self._search_regex(
            r'\s*var flashvars = (.*)', webpage, u'flashvars'))

        # Pick the best quality available, from highest to lowest
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            raise ExtractorError(u'Unable to extract video URL')

        # The URL is JSON-escaped inside flashvars; unquote and unescape it
        video_url = self._search_regex(
            r'"' + max_quality + r'":"(.+?)"', flashvars, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url).replace('\\/', '/')

        # TODO: support choosing qualities

        video_title = unescapeHTML(self._search_regex(
            r'<meta property="og:title" content="([^"]*)" />', webpage, u'title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # uploader is optional: warn, don't abort
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Site shows DD-MM-YYYY; reorder to YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
940
941
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the video info dict from a photobucket.com URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        # Extension is taken from the URL itself
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage and url are already decoded text, so the old
        # .decode('utf-8') calls were dropped: they fail on Python 3
        # (str has no decode) and on non-ASCII text in Python 2.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
998
999
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract the video info dict from a screen.yahoo.com URL.

        Two code paths: old pages are resolved through the cosmos.bcst
        MRSS API; pages that declare a Media CONTENT_ID go through the
        YQL JSON API instead.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Newer pages declare a long content id used by the YQL API
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Convert MM/DD/YYYY to YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check for a match *before* touching the groups; previously the
            # groups were read first, so a failed match raised AttributeError
            # instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1067
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Player/pro URLs are normalized to the canonical video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # Only IndexError (config marker missing) and ValueError (fragment is
        # not valid JSON) can occur here; the old bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (page gives ISO date; reduce to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1169
1170
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in "index-<n>.html"
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body.

        NOTE(review): returns urlopen().read() undecoded; callers run text
        regexes on it, which presumably relies on Python 2 str semantics —
        confirm before porting.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return {key: group} per matchTuples.

        matchTuples is a list of (group_index, key, error_message); a missing
        group raises ExtractorError with its error_message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp URL of a live stream page.

        NOTE(review): the computed video_url is built but never returned or
        stored — _real_extract ignores this method's result, so live streams
        produce no info dict.
        """
        video_lang = url.split('/')[-4]
        # First hop: locate the videothek javascript loader
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Second hop: pull the stream path, swf player and rtmp base url
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 (catch-up) page into an info dict.

        Follows two levels of XML indirection: the page points at a
        videoref file, which points at a per-language <video> XML that
        carries the final metadata and HD url.
        """
        video_lang = url.split('/')[-3]
        # First hop: videorefFileUrl embedded in the flash movie params
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Second hop: language-specific <video> reference
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final hop: id, title, date and HD url from the video XML
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode here assumes the title is a Python 2
            # byte string (pages are fetched undecoded); breaks on Python 3
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams currently yield no result (see extractLiveStream)
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1290
1291
1292 class GenericIE(InfoExtractor):
1293     """Generic last-resort information extractor."""
1294
1295     _VALID_URL = r'.*'
1296     IE_NAME = u'generic'
1297
1298     def report_download_webpage(self, video_id):
1299         """Report webpage download."""
1300         if not self._downloader.params.get('test', False):
1301             self._downloader.report_warning(u'Falling back on generic information extractor.')
1302         super(GenericIE, self).report_download_webpage(video_id)
1303
1304     def report_following_redirect(self, new_url):
1305         """Report information extraction."""
1306         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1307
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        # Issue HEAD requests so we never download the body while probing.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): handler order matters here — HTTPMethodFallback must
        # be registered so the 405 fallback runs before the error processor.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # No redirect happened: the final URL equals the original
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1363
1364     def _real_extract(self, url):
1365         new_url = self._test_redirect(url)
1366         if new_url: return [self.url_result(new_url)]
1367
1368         video_id = url.split('/')[-1]
1369         try:
1370             webpage = self._download_webpage(url, video_id)
1371         except ValueError as err:
1372             # since this is the last-resort InfoExtractor, if
1373             # this error is thrown, it'll be thrown here
1374             raise ExtractorError(u'Invalid URL: %s' % url)
1375
1376         self.report_extraction(video_id)
1377         # Start with something easy: JW Player in SWFObject
1378         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1379         if mobj is None:
1380             # Broaden the search a little bit
1381             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1382         if mobj is None:
1383             # Broaden the search a little bit: JWPlayer JS loader
1384             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1385         if mobj is None:
1386             raise ExtractorError(u'Invalid URL: %s' % url)
1387
1388         # It's possible that one of the regexes
1389         # matched, but returned an empty group:
1390         if mobj.group(1) is None:
1391             raise ExtractorError(u'Invalid URL: %s' % url)
1392
1393         video_url = compat_urllib_parse.unquote(mobj.group(1))
1394         video_id = os.path.basename(video_url)
1395
1396         # here's a fun little line of code for you:
1397         video_extension = os.path.splitext(video_id)[1][1:]
1398         video_id = os.path.splitext(video_id)[0]
1399
1400         # it's tempting to parse this further, but you would
1401         # have to take into account all the variations like
1402         #   Video Title - Site Name
1403         #   Site Name | Video Title
1404         #   Video Title - Tagline | Site Name
1405         # and so on and so forth; it's just not practical
1406         mobj = re.search(r'<title>(.*)</title>', webpage)
1407         if mobj is None:
1408             raise ExtractorError(u'Unable to extract title')
1409         video_title = mobj.group(1)
1410
1411         # video uploader is domain name
1412         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1413         if mobj is None:
1414             raise ExtractorError(u'Unable to extract title')
1415         video_uploader = mobj.group(1)
1416
1417         return [{
1418             'id':       video_id,
1419             'url':      video_url,
1420             'uploader': video_uploader,
1421             'upload_date':  None,
1422             'title':    video_title,
1423             'ext':      video_extension,
1424         }]
1425
1426
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the GData API (50 results per page) until the
    requested number of results, or the total reported by the API,
    is reached.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # On Python 2 the query may arrive as a locale-encoded byte
        # string; on Python 3 it is already text and has no .decode
        # (the unconditional decode used to crash there).
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API may report fewer total results than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1469
1470
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # Accumulate result links into a playlist-shaped dict.
        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        pagenum = 0
        while True:
            pagenum += 1
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each organic hit links its heading to the video page.
            for hit in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                playlist['entries'].append({
                    '_type': 'url',
                    'url': hit.group(1)
                })

            # Stop once enough results are gathered or no "next" link remains.
            if pagenum * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
1501
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Pages through the JSON search endpoint (30 results per page) until
    *n* results are collected or the API signals the last page.
    """

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # BUG FIX: i used to be unbound (NameError) below when a
            # results page came back empty.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip entries without a video link instead of
                    # crashing on mobj.group() of a None match.
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1535
1536
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        playlist_id = mobj.group(1) or mobj.group(2)

        # Walk the GData feed page by page; a short (or empty) page
        # marks the end of the playlist.
        videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break

        # Order by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1602
1603
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the watch-page video ids in *page*, in order, deduplicated."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = self.extract_videos_from_page(page)

        # Subsequent pages come from the JSON-based channel_ajax query.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1661
1662
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each query at _GDATA_PAGE_SIZE ids, so keep
        # paging until a partially filled page marks the end.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the page's video ids, preserving order, skipping repeats.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one; no need to
            # query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1719
1720
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Collects every video of a user by paging through the mobile
    full-episode-list Ajax endpoint.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # BUG FIX: an unchecked re.search().group() used to raise a bare
        # AttributeError when the id was missing; _search_regex reports
        # a proper extraction error instead.
        user_id = self._search_regex(r'data-users-id="([^"]+)"', page, u'user id')
        page_base = page_base % user_id

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUG FIX: dedupe used to compare the raw href but store
                # the unescaped one, so escaped duplicates slipped through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1779
1780
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Posts the 'Free download' form and scrapes the resulting fileshare
    URL; on refusal, surfaces the site's restriction message.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: '\s+' was a non-raw string (invalid escape);
                # collapse the whitespace in the site's error message.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the .decode calls below assume Python 2 byte
        # strings (as does matching str patterns against the raw bytes
        # of `webpage` above); on Python 3 str has no .decode —
        # TODO confirm against the compat layer before modernizing.
        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1825
1826
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Handles /video/video.php and /photo.php URLs. Optionally logs in
    first (credentials from the downloader params or .netrc) so that
    restricted videos can be retrieved; login failure only warns.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook before extraction, when credentials exist.

        Checks --username/--password first, then the 'facebook' .netrc
        entry. All failures are reported as warnings and extraction
        continues anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials configured: skip login entirely.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means we are still logged out.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit as a JSON blob between these two
        # fixed script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1922
1923
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Resolves player/api.swf URLs to a canonical page, then queries the
    page with skin=json to get the metadata; direct video responses
    short-circuit with the already-open url handle.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL (page, /play/ or api.swf)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the
            # real file id; rebuild the canonical URL and recurse once.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The JSON skin is served to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The server answered with the media itself instead of
                # JSON: derive id/title from the URL and hand the open
                # handle to the downloader (see 'urlhandle' field docs).
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp format is '%m-%d-%y %H:%M%p' (e.g. '05-31-13 07:00PM')
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2021
2022
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Two extraction paths exist: a plain <source> tag in the watch page
    (older pages), and an RC4-encrypted XML blob ("encxml") whose key is
    derived from a static secret plus the video id.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt *data* (bytes) with *key* (bytes); returns a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA)
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation algorithm (PRGA)
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        """Return the hex MD5 digest of *s* as a byte string."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Static, doubly base64-encoded component of the RC4 key
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source> tag on the watch page
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Rebuild the XML request URL from the flashvars; the '_encxml'
        # entry holds the endpoint itself, the rest are query parameters.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64-decoded secret + md5(video id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # BUG FIX: this branch referenced the undefined name
            # 'video_filepath' (NameError at runtime for f4m videos).
            # The decrypted data carries a path='...' attribute (see the
            # non-rtmp branch above), so extract it here.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'video path')
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2171
2172
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate format ids.
    # NOTE(review): _available_formats is not referenced anywhere in this
    # class — possibly vestigial.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id (for --list-formats output)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame dimensions per bitrate id (for --list-formats output)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in re.VERBOSE style, so matching must pass
        # that flag explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode/clip; returns a list of info dicts."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ":tds"-style shortcuts are rewritten to the show's
        # full-episodes page and re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The server redirects to the latest episode; re-parse the
            # final URL to pick up the concrete episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Find the mtvnservices media URI embedded in the page
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mrss feed lists the individual parts of the episode as <item>s
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # The mediaGen config lists one <rendition> (bitrate + RTMP
            # source URL) per available quality.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) source URL into a plain HTTP download URL
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2339
2340
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the og:video player URL from the page, pulls the player's
    JavaScript configuration, and takes the second playlist entry as the
    actual video URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = m.group('showname')
        video_id = m.group('episode')

        self.report_extraction(show_name)
        page = self._download_webpage(url, show_name)

        # Description and thumbnail are optional
        description = self._search_regex(
            '<meta name="description" content="([^"]*)"',
            page, u'description', fatal=False)
        if description:
            description = unescapeHTML(description)

        thumbnail = self._search_regex(
            '<meta property="og:image" content="([^"]*)"',
            page, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = unescapeHTML(thumbnail)

        player_url = unescapeHTML(self._search_regex(
            '<meta property="og:video" content="([^"]*)"',
            page, u'player url'))

        # The config location is passed to the player as a query argument
        config_url = compat_urllib_parse.unquote(self._search_regex(
            'config=(.*)$', player_url, u'config url'))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
2400
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoid')

        self.report_extraction(video_id)

        # Step 1: the moogaloop metadata XML carries title/description/
        # thumbnail and the f4m manifest location.
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            info = {
                'id': video_id,
                'uploader': None,
                'upload_date': None,
                'description': video_node.findall('./description')[0].text,
                'title': video_node.findall('./caption')[0].text,
                'thumbnail': video_node.findall('./thumbnail')[0].text,
            }
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Step 2: the f4m manifest names the media fragment to fetch
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Assemble the fragment URL on the manifest's host
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2462
2463
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL (percent-encoded inside the flashvars)
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'flv_url=(.+?)&', webpage, u'video URL'))

        # Title, without the trailing " - XVID..." suffix
        video_title = self._search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        # Thumbnail is optional
        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2504
2505
class SoundcloudIE(InfoExtractor):
    """Information extractor for a single soundcloud.com track.

    The track is resolved to its numeric id through the public
    resolve.json API; the MP3 stream URL is then read from the
    streams endpoint.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL path is uploader/slug
        uploader = m.group(1)
        slug_title = m.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2562
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    The set is resolved through the public resolve.json API; one entry
    is produced per track, each with its MP3 stream URL read from the
    streams endpoint.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL path is uploader/sets/slug
        uploader = m.group(1)
        slug_title = m.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2625
2626
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real video id is base64-encoded in the page's jsclassref
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(
            r'contentTitle = "(.*?)";', webpage, u'title')

        # Description is optional
        video_description = self._search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive the final id and extension from the URL's filename part
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2669
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): file_url is assigned but never used in this method.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open counts as "active"; the response body
                # is not read.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        # No URL in the list responded
        return None

    def _print_formats(self, formats):
        """Print every format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): str.decode() exists only on Python 2 byte strings;
        # under Python 3 these .decode() calls raise AttributeError.
        # The extractor is marked _WORKING = False.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format whose
            # URL actually responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2774
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page (?course=...) and a video page
    # (?course=...&video=...); the named groups select the branch below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site.

        Course and root pages return the flattened results of recursively
        running self.extract() on every page they link to.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and file name come from the per-video metadata XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # TODO: implement default_value in search_regex
            # Fall back to the course id when the page has no <h1> title.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            info['description'] = self._search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)
            if info['description']: info['description'] = unescapeHTML(info['description'])

            # Queue every linked video page for recursive extraction.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Queue every linked course page for recursive extraction.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2876
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single MTV.com video.

        Scrapes the video page for the mediaGen metadata URI, downloads the
        rendition list XML and picks the highest-quality rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # mtv_vt carries the song name; it may be missing for non-music clips.
        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)
        if song_name: song_name = unescapeHTML(song_name)

        # mtv_an carries the artist name, used both as title and uploader.
        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)
        # BUG FIX: the info dict below referenced an undefined `performer`
        # (NameError on every extraction). mtv_an is the performer name, so
        # reuse the value extracted above.
        performer = video_title

        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are mandatory for the mediaGen URL; fail with a clear
        # message instead of a TypeError on `None` concatenation.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type like "video/mp4" -> extension "mp4".
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2939
2940
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: millisecond timestamp plus
        two random numbers, concatenated as a decimal string."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character permutation used to decode obfuscated file ids.

        Reproduces Youku's seeded linear-congruential shuffle of a fixed
        alphabet; the same seed always yields the same permutation.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # LCG step, then pick (and consume) one alphabet character.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated obfuscated file id: each element indexes
        into the seed-derived permutation table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as a list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format onto Youku's stream names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment; needed to authorize each segment download.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3033
3034
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract a single xnxx.com video from its watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # One page fetch supplies every field we need.
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the flash variables.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        video_thumbnail = self._search_regex(
            self.VIDEO_THUMB_RE, webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3073
3074
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Two-step scrape: the post page yields metadata plus the photo page
        URL; the photo page yields the direct googlevideo.com links, from
        which the highest resolution is chosen.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        # TODO: implement default_value in search_regex
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3152
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract an NBA.com video; the CDN URL is derived from the path."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # TODO: implement default_value in search_regex
        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: key was misspelled 'uploader_date'; the downloader
            # only recognizes the optional field 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3189
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel archive, a single broadcast (/b/),
    # or a chapter of a broadcast (/c/).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when paging through a channel's archive API.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one archive API page and build info dicts for its clips.

        Returns (raw item count, info dicts); clips without a
        video_file_url are counted but not included.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API reports errors as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with a dashed date; keep only YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: channel archive, chapter, or broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive API in the loop below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element that matches the page's archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        # Page through the API; a short page means we reached the end.
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3322
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = unescapeHTML(self._search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL))

        # Title: prefer the player headline, then the page <title>.
        # TODO: implement fallbacks in regex_search
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
                                webpage, flags=re.DOTALL)
        if title_match is None:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if title_match is None:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        video_description = self._search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', flags=re.DOTALL)
        if video_description:
            video_description = unescapeHTML(video_description)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
3358
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains whitespace and comments, hence re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every movie on a game's store page as one playlist."""
        page_match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = page_match.group('gameID')

        # Hit the age-gate URL directly with a fixed fake birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Walk the three match streams in lockstep: movie blob, title, thumb.
        videos = []
        triples = zip(re.finditer(urlRE, webpage),
                      re.finditer(namesRE, webpage),
                      re.finditer(thumbsRE, webpage))
        for movie, name, thumb in triples:
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3403
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded Ustream video (direct CDN FLV URL)."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The recording id maps straight onto the CDN path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        if uploader:
            uploader = unescapeHTML(uploader.strip())

        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: this extractor returns a bare info dict (not wrapped in a
        # list), matching its original behavior.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
3436
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract a WorldStarHipHop (or WSHH candy) video."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')

        # The flash "file" variable carries either an mp4 or an flv URL.
        ext = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3476
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract an RBMA Radio show from the page's embedded gon JSON."""
        show_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, show_id)

        raw_json = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
            webpage, u'json data')

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append a fixed bitrate selector to the Akamai stream URL; the
        # extension is whatever follows the last dot in the URL path.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_ext = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]

        host = show.get('host', {})
        image = show.get('image', {})
        return [{
            'id': show_id,
            'url': stream_url,
            'ext': stream_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3510
3511
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' field equals
        req_format, or None if no entry matches."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site hides content behind an age gate; this cookie skips it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
            webpage, u'title').strip()

        # Get the video date (optional; normalized to YYYYMMDD)
        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
            webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date.strip())

        # Get the video uploader (optional)
        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
            webpage, u'uploader', fatal=False)
        if video_uploader: video_uploader = clean_html(video_uploader.strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>"; keep the
            # first two pieces as the format identifier.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the original tested the undefined name 'result' here,
            # raising NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3614
3615
3616
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player configuration.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # The upload date is optional; normalize it to YYYYMMDD when present.
        upload_date = self._search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3651
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The title lives on the watch page ...
        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # ... but the media URL is only on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3692
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is enough to start a play session via the API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_no = 0
        # The API hands out one track at a time; follow it until it reports
        # the last track of the mix.
        while True:
            track_no += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_no), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3733
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN layout keyed by the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = unescapeHTML(self._search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title'))

        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)
        if uploader:
            uploader = clean_html(uploader)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3763
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose-mode pattern: matches either a playlist URL or a single-talk
    # URL, with an optional /lang/<code> segment before the name.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag;
        # presumably the inherited check matches without it -- verify
        # against the InfoExtractor base class.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched: a single
        # talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk <li> carries its numeric id and media slug in data-*
        # attributes; the talk page URL and display name come from a
        # separate "talk-title" pattern.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair the two match streams positionally; assumes both patterns
        # occur in the same order and count on the page -- TODO confirm.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Each talk is delegated back to this extractor via a url result.
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object holding the numeric video id
        # and the media slug used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3842
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: the original read the undefined name 'ext' here,
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: missing elements simply yield None.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3896
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = unescapeHTML(self._search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title'))

        # A per-video XML document describes the available variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last variant in the document is used -- presumably the best
        # quality one; confirm against the XML layout.
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3929
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped off.
        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()

        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        if video_description:
            video_description = unescapeHTML(video_description)

        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
3968
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer an explicit documentId query parameter; otherwise take
        # the id from the URL path.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max((s for s in streams if int(s["media_type"]) == 0),
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4007
class TumblrIE(InfoExtractor):
    """Information extractor for videos hosted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The <video> markup sits inside \xNN-escaped javascript.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = unescapeHTML(self._search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL))

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
4042
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # The track dictionary is embedded in a javascript assignment.
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # Rebuild the url bandcamp's download_bunde_*.js script would request
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }]
4088
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4116         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Metadata is served as an MRSS feed by the player backend.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4143
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical watch page for this id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
4177
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical https page for this id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4211
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos

    Resolves a flickr.com photo page to its video stream by fetching two
    intermediate XML documents (metadata, then playlist).
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL; without this the mobj.group()
        # calls below would raise AttributeError instead of a clear error
        # (same pattern as TeamcocoIE/HypemIE).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-video secret is required by both XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final URL is APP + HTML-unescaped FULLPATH; two groups are
        # needed here, so plain re.search is used instead of _search_regex.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # og: meta tags may be quoted with either " or '; _search_regex
        # returns whichever alternation group matched.
        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4260
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos.

    Finds the numeric video id on the page, then pulls the media URL
    from the site's companion XML document.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in the <article class="video"> tag.
        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document exposes the actual file URLs.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        info = {
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }
        return [info]
4299         
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster

    Extracts the media URL (either a direct file or server/key pair),
    title, description, upload date, uploader and thumbnail.
    """
    # Bug fix: the dot in "www." was unescaped and matched any character.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL; without this the mobj.group()
        # call below would raise AttributeError instead of a clear error
        # (same pattern as TeamcocoIE/HypemIE).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url='http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config holds either a direct (URL-encoded) file, or a
        # server plus key that are joined into an RTMP-style URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Description is optional on the page; default to empty string.
        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        if mobj is None:
            video_description = u''
        else:
            video_description = unescapeHTML(mobj.group('description'))

        # Upload date is embedded in a tooltip as YYYY-MM-DD; reassemble as
        # the YYYYMMDD form expected by the 'upload_date' field.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract upload date')
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')

        # Anonymous uploads have no user link.
        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        if mobj is None:
            video_uploader_id = u'anonymous'
        else:
            video_uploader_id = mobj.group('uploader_id')

        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')
        video_thumbnail = mobj.group('thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
4356
class HypemIE(InfoExtractor):
    """Information Extractor for hypem

    Reads the track list JSON embedded in the page, then requests the
    serve endpoint (authenticated via the page's Set-Cookie header) to
    obtain the final mp3 URL.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The site expects ax/ts query parameters on the page request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie must be replayed on the serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)
        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
        if mobj is None:
            # Bug fix: error message read "extrack" instead of "extract".
            raise ExtractorError(u'Unable to extract tracks')
        html_tracks = mobj.group(1).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # Empty-string data makes this a POST request.
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4407
4408
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Classes are listed (not instantiated) here; one instance of each is
    # created on every call, in this exact precedence order.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4474
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention at module level.
    class_name = '%sIE' % ie_name
    return globals()[class_name]