XHamster: Can't see the description anywhere in the UI
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle.

        note=None prints the default "Downloading webpage" message,
        note=False suppresses output entirely.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second argument is the original traceback, kept for debugging.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header,
        # falling back to UTF-8 when none is present.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is already a plain URL string
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # FIX: initialize mobj so that an empty pattern list does not
            # raise UnboundLocalError at the "if mobj:" check below.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj: break

        # Highlight the field name in blue on capable terminals.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None
225
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (first result only), a positive number, or "all"
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search "URL" and delegate to _get_n_results()."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: download just the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # Defensive: the regex already excludes 0 and negatives
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # FIX: corrected "sublclasses" typo in the error message
        raise NotImplementedError("This method must be implemented by subclasses")
264
265
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches watch pages, embeds, short youtu.be links and bare video ids;
    # almost everything is optional so a naked id is accepted too.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # URL used to force the interface language (hl) and region (gl) to English/US.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the next_url query parameter on age-verification redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same format codes, but free (webm) formats ordered before mp4 ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Format code -> container extension (default elsewhere is 'flv').
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Format code -> display dimension string, shown by _print_formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Overrides the IE_NAME property inherited from InfoExtractor.
    IE_NAME = u'youtube'
325
326     @classmethod
327     def suitable(cls, url):
328         """Receives a URL and returns True if suitable for this IE."""
329         if YoutubePlaylistIE.suitable(url): return False
330         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
331
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
372
373     def _get_available_subtitles(self, video_id):
374         self.report_video_subtitles_download(video_id)
375         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
376         try:
377             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
378         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
379             return (u'unable to download video subtitles: %s' % compat_str(err), None)
380         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
381         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
382         if not sub_lang_list:
383             return (u'video doesn\'t have subtitles', None)
384         return sub_lang_list
385
386     def _list_available_subtitles(self, video_id):
387         sub_lang_list = self._get_available_subtitles(video_id)
388         self.report_video_subtitles_available(video_id, sub_lang_list)
389
390     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
391         """
392         Return tuple:
393         (error_message, sub_lang, sub)
394         """
395         self.report_video_subtitles_request(video_id, sub_lang, format)
396         params = compat_urllib_parse.urlencode({
397             'lang': sub_lang,
398             'name': sub_name,
399             'v': video_id,
400             'fmt': format,
401         })
402         url = 'http://www.youtube.com/api/timedtext?' + params
403         try:
404             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
405         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
406             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
407         if not sub:
408             return (u'Did not fetch video subtitles', None, None)
409         return (None, sub_lang, sub)
410
411     def _request_automatic_caption(self, video_id, webpage):
412         """We need the webpage for getting the captions url, pass it as an
413            argument to speed up the process."""
414         sub_lang = self._downloader.params.get('subtitleslang')
415         sub_format = self._downloader.params.get('subtitlesformat')
416         self.to_screen(u'%s: Looking for automatic captions' % video_id)
417         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
418         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
419         if mobj is None:
420             return [(err_msg, None, None)]
421         player_config = json.loads(mobj.group(1))
422         try:
423             args = player_config[u'args']
424             caption_url = args[u'ttsurl']
425             timestamp = args[u'timestamp']
426             params = compat_urllib_parse.urlencode({
427                 'lang': 'en',
428                 'tlang': sub_lang,
429                 'fmt': sub_format,
430                 'ts': timestamp,
431                 'kind': 'asr',
432             })
433             subtitles_url = caption_url + '&' + params
434             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
435             return [(None, sub_lang, sub)]
436         except KeyError:
437             return [(err_msg, None, None)]
438
439     def _extract_subtitle(self, video_id):
440         """
441         Return a list with a tuple:
442         [(error_message, sub_lang, sub)]
443         """
444         sub_lang_list = self._get_available_subtitles(video_id)
445         sub_format = self._downloader.params.get('subtitlesformat')
446         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
447             return [(sub_lang_list[0], None, None)]
448         if self._downloader.params.get('subtitleslang', False):
449             sub_lang = self._downloader.params.get('subtitleslang')
450         elif 'en' in sub_lang_list:
451             sub_lang = 'en'
452         else:
453             sub_lang = list(sub_lang_list.keys())[0]
454         if not sub_lang in sub_lang_list:
455             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
456
457         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
458         return [subtitle]
459
460     def _extract_all_subtitles(self, video_id):
461         sub_lang_list = self._get_available_subtitles(video_id)
462         sub_format = self._downloader.params.get('subtitlesformat')
463         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
464             return [(sub_lang_list[0], None, None)]
465         subtitles = []
466         for sub_lang in sub_lang_list:
467             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
468             subtitles.append(subtitle)
469         return subtitles
470
471     def _print_formats(self, formats):
472         print('Available formats:')
473         for x in formats:
474             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
475
    def _real_initialize(self):
        """Set the interface language and, when credentials are available,
        log in to YouTube and confirm age verification.

        All failures except age confirmation are reported as warnings and
        abort initialization without raising.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh form tokens from the login page;
        # they are echoed back in the login POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, the credentials
            # were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
578
579     def _extract_id(self, url):
580         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
581         if mobj is None:
582             raise ExtractorError(u'Invalid URL: %s' % url)
583         video_id = mobj.group(2)
584         return video_id
585
586     def _real_extract(self, url):
587         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
588         mobj = re.search(self._NEXT_URL_RE, url)
589         if mobj:
590             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
591         video_id = self._extract_id(url)
592
593         # Get video webpage
594         self.report_video_webpage_download(video_id)
595         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
596         request = compat_urllib_request.Request(url)
597         try:
598             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
599         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
600             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
601
602         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
603
604         # Attempt to extract SWF player URL
605         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
606         if mobj is not None:
607             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
608         else:
609             player_url = None
610
611         # Get video info
612         self.report_video_info_webpage_download(video_id)
613         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
614             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
615                     % (video_id, el_type))
616             video_info_webpage = self._download_webpage(video_info_url, video_id,
617                                     note=False,
618                                     errnote='unable to download video info webpage')
619             video_info = compat_parse_qs(video_info_webpage)
620             if 'token' in video_info:
621                 break
622         if 'token' not in video_info:
623             if 'reason' in video_info:
624                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
625             else:
626                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
627
628         # Check for "rental" videos
629         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
630             raise ExtractorError(u'"rental" videos not supported')
631
632         # Start extracting information
633         self.report_information_extraction(video_id)
634
635         # uploader
636         if 'author' not in video_info:
637             raise ExtractorError(u'Unable to extract uploader name')
638         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
639
640         # uploader_id
641         video_uploader_id = None
642         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
643         if mobj is not None:
644             video_uploader_id = mobj.group(1)
645         else:
646             self._downloader.report_warning(u'unable to extract uploader nickname')
647
648         # title
649         if 'title' not in video_info:
650             raise ExtractorError(u'Unable to extract video title')
651         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
652
653         # thumbnail image
654         if 'thumbnail_url' not in video_info:
655             self._downloader.report_warning(u'unable to extract video thumbnail')
656             video_thumbnail = ''
657         else:   # don't panic if we can't find it
658             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
659
660         # upload date
661         upload_date = None
662         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
663         if mobj is not None:
664             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
665             upload_date = unified_strdate(upload_date)
666
667         # description
668         video_description = get_element_by_id("eow-description", video_webpage)
669         if video_description:
670             video_description = clean_html(video_description)
671         else:
672             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
673             if fd_mobj:
674                 video_description = unescapeHTML(fd_mobj.group(1))
675             else:
676                 video_description = u''
677
678         # subtitles
679         video_subtitles = None
680
681         if self._downloader.params.get('writesubtitles', False):
682             video_subtitles = self._extract_subtitle(video_id)
683             if video_subtitles:
684                 (sub_error, sub_lang, sub) = video_subtitles[0]
685                 if sub_error:
686                     # We try with the automatic captions
687                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
688                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
689                     if sub is not None:
690                         pass
691                     else:
692                         # We report the original error
693                         self._downloader.report_error(sub_error)
694
695         if self._downloader.params.get('allsubtitles', False):
696             video_subtitles = self._extract_all_subtitles(video_id)
697             for video_subtitle in video_subtitles:
698                 (sub_error, sub_lang, sub) = video_subtitle
699                 if sub_error:
700                     self._downloader.report_error(sub_error)
701
702         if self._downloader.params.get('listsubtitles', False):
703             sub_lang_list = self._list_available_subtitles(video_id)
704             return
705
706         if 'length_seconds' not in video_info:
707             self._downloader.report_warning(u'unable to extract video duration')
708             video_duration = ''
709         else:
710             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
711
712         # token
713         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
714
715         # Decide which formats to download
716         req_format = self._downloader.params.get('format', None)
717
718         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
719             self.report_rtmp_download()
720             video_url_list = [(None, video_info['conn'][0])]
721         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
722             url_map = {}
723             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
724                 url_data = compat_parse_qs(url_data_str)
725                 if 'itag' in url_data and 'url' in url_data:
726                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
727                     if not 'ratebypass' in url: url += '&ratebypass=yes'
728                     url_map[url_data['itag'][0]] = url
729
730             format_limit = self._downloader.params.get('format_limit', None)
731             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
732             if format_limit is not None and format_limit in available_formats:
733                 format_list = available_formats[available_formats.index(format_limit):]
734             else:
735                 format_list = available_formats
736             existing_formats = [x for x in format_list if x in url_map]
737             if len(existing_formats) == 0:
738                 raise ExtractorError(u'no known formats available for video')
739             if self._downloader.params.get('listformats', None):
740                 self._print_formats(existing_formats)
741                 return
742             if req_format is None or req_format == 'best':
743                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
744             elif req_format == 'worst':
745                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
746             elif req_format in ('-1', 'all'):
747                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
748             else:
749                 # Specific formats. We pick the first in a slash-delimeted sequence.
750                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
751                 req_formats = req_format.split('/')
752                 video_url_list = None
753                 for rf in req_formats:
754                     if rf in url_map:
755                         video_url_list = [(rf, url_map[rf])]
756                         break
757                 if video_url_list is None:
758                     raise ExtractorError(u'requested format not available')
759         else:
760             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
761
762         results = []
763         for format_param, video_real_url in video_url_list:
764             # Extension
765             video_extension = self._video_extensions.get(format_param, 'flv')
766
767             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
768                                               self._video_dimensions.get(format_param, '???'))
769
770             results.append({
771                 'id':       video_id,
772                 'url':      video_real_url,
773                 'uploader': video_uploader,
774                 'uploader_id': video_uploader_id,
775                 'upload_date':  upload_date,
776                 'title':    video_title,
777                 'ext':      video_extension,
778                 'format':   video_format,
779                 'thumbnail':    video_thumbnail,
780                 'description':  video_description,
781                 'player_url':   player_url,
782                 'subtitles':    video_subtitles,
783                 'duration':     video_duration
784             })
785         return results
786
787
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age.

        Metacafe gates some content behind an age check; posting the
        filter form once per session disables it for later requests.

        Raises:
            ExtractorError: if either HTTP round-trip fails.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by submitting the filter form ("filters": "0"
        # means the family filter is switched off).
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video information from a metacafe.com watch URL.

        Returns a one-element list with the usual info dictionary, or a
        url_result delegation for YouTube-mirrored ids ("yt-<id>").

        Raises:
            ExtractorError: on invalid URLs or when the media URL/title/
                uploader cannot be located in the page.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube ("yt-<id>" ids are mirrors)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style pages expose the media URL directly.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended to the URL) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages embed the media info as JSON inside the
            # "flashvars" player parameter.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: the webpage is already a text string here (sibling
        # extractors in this file use its match groups undecoded), so the
        # old .decode('utf-8') calls were redundant and raised
        # UnicodeDecodeError for non-ASCII titles/uploaders; removed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
883
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Pull the video URL, title, uploader and date from a watch page."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the first path component, stripped of any title
        # suffix ("_...") and query string ("?...").
        video_id = url_match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-gated videos stay reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The player configuration lives in a JS "flashvars" assignment.
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the first (i.e. highest) quality key present in flashvars.
        quality_keys = ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url')
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        quality_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if quality_match is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(quality_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: try regular owners first, then "official" accounts.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is shown as DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
958
959
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info, preferring the JSON blob the page embeds
        for its player, with an HTML <link>/<title> scrape as fallback.

        Raises:
            ExtractorError: on invalid URLs or when neither extraction
                strategy finds the required fields.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        # Extension is taken straight from the URL (flv or mp4).
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking at the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: the webpage (and hence its match groups) is already a
        # text string, so the old .decode('utf-8') calls were redundant
        # and broke on non-ASCII titles/uploaders; removed.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1016
1017
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video information from a screen.yahoo.com page.

        Two strategies: when the page declares a Media CONTENT_ID, the
        YQL streams API is queried (JSON); otherwise the legacy
        cosmos.bcst.yahoo.com REST endpoints are scraped (MRSS/XML).

        Raises:
            ExtractorError: on invalid URLs or missing video info.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: numeric id from the URL, queried against the
            # cosmos REST API.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: check the match *before* accessing its groups; the
            # old code called m_rest.group(...) first, so a failed match
            # raised AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get at the JSON payload.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            # The first stream is used; rtmp host + play path.
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1085
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Parses the player config JSON embedded in the page, then picks a
        codec/quality pair and builds the play_redirect download URL.

        Raises:
            ExtractorError: on invalid URLs, embed-restricted videos,
                unparsable config, or when no known codec is offered.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Player/pro URLs don't carry the config; fetch the canonical page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (embedded as "... = {config:...,assets:...")
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: narrowed the bare "except:" so KeyboardInterrupt and
        # SystemExit are no longer swallowed; parse failures behave as before.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd over sd over whatever else is listed; the for/else
        # raises only when no codec bucket has any entry at all.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1187
1188
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body.

        Raises ExtractorError on network failures or invalid URLs.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and match *regex* (with *regexFlags*) against it.

        matchTuples is a list of (group_index, key, error_message); each
        named group index is stored under *key* in the returned dict, or
        ExtractorError(error_message) is raised if that group is empty.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the rtmp URL.

        NOTE(review): the final video_url is computed but never returned,
        so this method always yields None and _real_extract produces no
        result for live streams — looks unfinished; confirm upstream.
        """
        # Language code is encoded in the URL path (e.g. .../fr/...).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two XML indirections and return
        an info dictionary for the hd-quality stream."""
        # Language code is a different path component than for live URLs.
        video_lang = url.split('/')[-3]
        # Step 1: the player's <param name="movie"> carries the ref-file URL.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the ref file points at a per-language <video> element.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML contains id, title, date and the hd URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path based on
        whether the URL tail matches _LIVE_URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns nothing, so live
            # URLs currently produce no download.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1308
1309
1310 class GenericIE(InfoExtractor):
1311     """Generic last-resort information extractor."""
1312
1313     _VALID_URL = r'.*'
1314     IE_NAME = u'generic'
1315
1316     def report_download_webpage(self, video_id):
1317         """Report webpage download."""
1318         if not self._downloader.params.get('test', False):
1319             self._downloader.report_warning(u'Falling back on generic information extractor.')
1320         super(GenericIE, self).report_download_webpage(video_id)
1321
1322     def report_following_redirect(self, new_url):
1323         """Report information extraction."""
1324         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1325
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects; returns the final URL, or False when *url* does not
        redirect anywhere. Raises ExtractorError if the opener yields
        no response (unsupported protocol).
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request headers only, not the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: the redirected HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other redirect-ish code is surfaced as an HTTP error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) Request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener with only the handlers we need; order matters
        # for urllib handler dispatch.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL back means there was no redirect.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1381
1382     def _real_extract(self, url):
1383         new_url = self._test_redirect(url)
1384         if new_url: return [self.url_result(new_url)]
1385
1386         video_id = url.split('/')[-1]
1387         try:
1388             webpage = self._download_webpage(url, video_id)
1389         except ValueError as err:
1390             # since this is the last-resort InfoExtractor, if
1391             # this error is thrown, it'll be thrown here
1392             raise ExtractorError(u'Invalid URL: %s' % url)
1393
1394         self.report_extraction(video_id)
1395         # Start with something easy: JW Player in SWFObject
1396         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1397         if mobj is None:
1398             # Broaden the search a little bit
1399             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1400         if mobj is None:
1401             # Broaden the search a little bit: JWPlayer JS loader
1402             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             raise ExtractorError(u'Invalid URL: %s' % url)
1405
1406         # It's possible that one of the regexes
1407         # matched, but returned an empty group:
1408         if mobj.group(1) is None:
1409             raise ExtractorError(u'Invalid URL: %s' % url)
1410
1411         video_url = compat_urllib_parse.unquote(mobj.group(1))
1412         video_id = os.path.basename(video_url)
1413
1414         # here's a fun little line of code for you:
1415         video_extension = os.path.splitext(video_id)[1][1:]
1416         video_id = os.path.splitext(video_id)[0]
1417
1418         # it's tempting to parse this further, but you would
1419         # have to take into account all the variations like
1420         #   Video Title - Site Name
1421         #   Site Name | Video Title
1422         #   Video Title - Tagline | Site Name
1423         # and so on and so forth; it's just not practical
1424         mobj = re.search(r'<title>(.*)</title>', webpage)
1425         if mobj is None:
1426             raise ExtractorError(u'Unable to extract title')
1427         video_title = mobj.group(1)
1428
1429         # video uploader is domain name
1430         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1431         if mobj is None:
1432             raise ExtractorError(u'Unable to extract title')
1433         video_uploader = mobj.group(1)
1434
1435         return [{
1436             'id':       video_id,
1437             'url':      video_url,
1438             'uploader': video_uploader,
1439             'upload_date':  None,
1440             'title':    video_title,
1441             'ext':      video_extension,
1442         }]
1443
1444
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # BUGFIX: on Python 3 the query is already text and has no
        # .decode(); only decode raw bytes (Python 2 str).
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API (50 results per page) and returns a
        playlist result of at most n video URLs.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API may report fewer total items than requested;
            # shrink the limit so the loop terminates.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in video_ids]
        return self.playlist_result(videos, query)
1487
1488
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Scrapes Google video-search result pages and returns a playlist
        dict of 'url' entries, capped at n results.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop when enough pages were fetched or no next-page link exists.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # BUGFIX: honor the requested count — previously more than
                # n entries could be returned.
                res['entries'] = res['entries'][:n]
                return res
1519
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo! video search JSON endpoint (30 results
        per page) and returns a playlist dict of at most n entries.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # BUGFIX: skip results the URL regex does not match instead
                # of crashing on mobj.group of None
                if mobj is None:
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # BUGFIX: don't reference the loop variable after the loop —
            # it is undefined when 'results' is empty (NameError before).
            if (pagenum * 30 + len(results) >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1553
1554
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default matcher
        # cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Pull the playlist id out of the URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        playlist_id = mobj.group(1) or mobj.group(2)

        # Walk the GData feed page by page; each entry carries its playlist
        # position so the final list can be sorted into playlist order.
        videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            feed_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(feed_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            entries = response['feed']['entry']
            videos.extend((entry['yt$position']['$t'], entry['content']['src'])
                          for entry in entries if 'content' in entry)

            # A short page means this was the last one
            if len(entries) < self._MAX_RESULTS:
                break

        ordered_urls = [video_url for (_, video_url) in sorted(videos)]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1620
1621
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in page, in first-seen order, deduplicated."""
        found = []
        seen = set()
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in seen:
                seen.add(video_id)
                found.append(video_id)
        return found

    def _real_extract(self, url):
        # Pull the channel id from the URL
        mobj = re.match(self._VALID_URL, url)
        if not mobj:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # The first page is plain HTML
        pagenum = 1
        page = self._download_webpage(self._TEMPLATE_URL % (channel_id, pagenum),
                                      channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Any further pages come from the JSON-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                ajax_page = self._download_webpage(self._MORE_PAGES_URL % (pagenum, channel_id),
                                                   channel_id,
                                                   u'Downloading page #%s' % pagenum)
                payload = json.loads(ajax_page)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))
                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                   for video_id in video_ids]
        return [self.playlist_result(entries, channel_id)]
1679
1680
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Pull the username from the URL
        mobj = re.match(self._VALID_URL, url)
        if not mobj:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # The Data API caps each response at _GDATA_PAGE_SIZE entries, so
        # request page after page until one comes back short — that page
        # must be the last one.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the ids on this page, first occurrence only
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # A short page is the last page — no need to query again
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1737
1738
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Pull the username from the URL
        mobj = re.match(self._VALID_URL, url)
        if not mobj:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # The numeric users_id needed by the AJAX endpoint is embedded in
        # the user page markup.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = ('http://m.blip.tv/pr/show_get_full_episode_list'
                     '?users_id=%s&lite=0&esi=1') % mobj.group(1)

        # The endpoint returns at most _PAGE_SIZE results per call, so
        # keep paging until a short page signals the end.
        video_ids = []
        for pagenum in itertools.count(1):
            result_page = self._download_webpage(page_base + "&page=" + str(pagenum),
                                                 username,
                                                 u'Downloading video ids from page %d' % pagenum)

            # Collect this page's ids (membership is tested on the raw
            # match while the stored value is HTML-unescaped, matching the
            # long-standing behavior of this extractor)
            ids_in_page = []
            for match in re.finditer(r'href="/([^"]+)"', result_page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(match.group(1)))
            video_ids.extend(ids_in_page)

            # A short page is the last page — stop querying
            if len(ids_in_page) < self._PAGE_SIZE:
                break

        url_entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, playlist_title = username)]
1797
1798
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # BUGFIX: decode the payload to text right away — on Python 3
            # the str regexes below failed against a bytes page.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # All values are already text — the old .decode('utf-8') calls on
        # them crashed on Python 3 str.
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1843
1844
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optionally log in before extraction.  Credentials come from the
        # downloader's 'username'/'password' params or, failing that, from
        # a .netrc entry for machine 'facebook'.  Without credentials this
        # is a no-op.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A broken .netrc only disables login; extraction of public
                # videos may still work, so warn and continue.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, the login
            # did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Pull the numeric video id from the URL, then scrape the video
        # page for the JSON blob the Flash player is initialised with.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two literal javascript
        # fragments on the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON nested inside the outer JSON blob
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1940
1941
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        # Canonicalize player/API URLs to a regular blip.tv page, then
        # query the site's JSON API ('skin=json') for the video metadata.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # real file id; rebuild the canonical URL and recurse once.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves JSON-friendly content to iTunes clients
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The URL itself points at the media file; build the result
                # from the URL alone, passing the open handle through so
                # the downloader can reuse it.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # 'urlh' is the handle opened in the try block above
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some API responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '05-31-13 09:29AM' → '20130531'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2039
2040
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Tries three strategies in order:
      1. a plain <source src='...'> tag in the watch page (direct flv),
      2. the RC4-encrypted XML playlist referenced by the '_encxml'
         flashvars parameter,
      3. inside (2), either an RTMP connection URL or an HTTP
         path/source pair.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt *data* (bytes) with *key* (bytes); returns a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA)
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation (PRGA): XOR the keystream onto the data
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        """Return the hex MD5 digest of *s* (bytes), itself as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Strategy 1: a direct <source> tag in the page (always served as flv)
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # Strategy 2: the encrypted XML playlist referenced by the flashvars
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Get the encrypted data: the response is 'enc_data=<hex>'; decrypt
        # with RC4 keyed by md5(b64decode(b64decode(GK)) + md5(video_id)).
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        # Prefer the RTMP connection URL when present
        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos: plain HTTP base path + file name
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rtmpdump-style play path: '<ext>:<path-without-extension>'
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # BUGFIX: this branch previously referenced an undefined name
            # 'video_filepath', raising NameError for every f4m (HDS)
            # video.  Extract the HTTP base path from the decrypted XML
            # instead and derive the HLS playlist URL from it.
            video_filepath = self._search_regex(
                r'path=\'(.*?)\'', dec_data, u'filepath')
            video_filepath = compat_urllib_parse.unquote(video_filepath)
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2189
2190
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves a show/episode/clip URL to an mrss feed on
    shadow.comedycentral.com, downloads one mediagen configuration per
    feed item, and rewrites the chosen RTMP rendition URL into a plain
    HTTP one.  A multi-part episode yields one info dict per part.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest-quality last in _print_formats terms.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container per bitrate (currently always mp4).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame size per bitrate, shown by --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in re.VERBOSE style,
        # which the default suitable() does not use.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the --list-formats table (bitrate, container, dimensions)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand ':tds'-style abbreviations to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title and whether we must follow a redirect
        # to discover the newest episode (dlNewest).
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The bare full-episodes page redirects to the latest episode;
            # re-parse the final URL to get its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid-style URIs embedded in the page identify the media items.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the mrss feed listing the episode's parts.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like '...:<showId>.com:<shortMediaId>'
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # The mediagen config lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) URL into a plain HTTP one on the CDN;
            # only the 'gsp.comedystor/...' tail is kept.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2357
2358
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads the og:video player URL from the page, follows its 'config='
    query parameter to a JavaScript-flavoured JSON playlist and takes
    the second playlist entry as the video URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        show_name = mobj.group('showname')
        video_id = mobj.group('episode')

        self.report_extraction(show_name)
        webpage = self._download_webpage(url, show_name)

        # Description and thumbnail are optional; unescape only when found.
        description = self._search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)
        description = unescapeHTML(description) if description else description

        thumbnail = self._search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)
        thumbnail = unescapeHTML(thumbnail) if thumbnail else thumbnail

        player_url = unescapeHTML(self._search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url'))

        # The player URL carries the (url-encoded) config location.
        config_url = compat_urllib_parse.unquote(
            self._search_regex('config=(.*)$', player_url, u'config url'))

        config_json = self._download_webpage(config_url, show_name,
                                             u'Downloading configuration',
                                             u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        config_json = config_json.replace("'", '"')

        try:
            config = json.loads(config_json)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumbnail,
            'description': description,
            'player_url': player_url,
        }]
2418
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video id, then the
    Adobe HDS (f4m) manifest it points at, and builds a single-fragment
    'Seg1-Frag1' URL from the manifest's media node.
    """

    # Marked not working; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Filled in incrementally as the two XML documents are parsed.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore version parameter is required by the HDS servers.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # NOTE: video_id is deliberately re-bound here to the manifest's
            # <id> text; the final URL is built from it below.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # NOTE(review): the '/z' + id[:-2] + ... 'Seg1-Frag1' construction
        # assumes a single-segment HDS stream — presumably why _WORKING is
        # False; confirm before re-enabling.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2480
2481
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    The flash player receives the (url-encoded) flv location in its
    'flv_url' flashvar; title and thumbnail are scraped from the page.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Video URL (url-encoded inside the flashvars)
        flv_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(flv_url)

        # Title: the page <title> has the form '<name> - XVID...'
        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Thumbnail (optional)
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2522
2523
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the permalink through the public /resolve.json API, then
    fetches the track's stream definitions and returns the 128 kbps
    MP3 stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader name and track slug are both taken from the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the track's JSON description.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2580
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    Resolves the set permalink through the public /resolve.json API,
    then fetches the stream definitions of every track in the set and
    returns one entry per track (128 kbps MP3 stream).
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader name and set slug are both taken from the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the set's JSON description.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # The resolver reports problems (e.g. private sets) as a list.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2643
2644
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The page embeds a base64-encoded RTMP id in the 'jsclassref'
    JavaScript variable, which is decoded into an rtmpe URL.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Video URL: decode the base64 'jsclassref' value into the RTMP id
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Description (optional)
        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2687
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Queries the site's /api/1/cloudcast/ JSON endpoint, then probes the
    listed download URLs (per format, per bitrate) and returns the
    first one that answers.
    """

    # NOTE(review): the .decode('utf-8') calls in _real_extract operate on
    # str under Python 3 and would raise AttributeError there; they only
    # make sense for Python 2 byte strings — presumably part of why this
    # is disabled.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        If the format has per-bitrate sub-entries, 'best' (or an unknown
        bitrate) selects the highest one; a flat entry (TypeError on
        indexing) is returned as-is.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each URL with a GET; the response body is not read and
        # the handle is not explicitly closed (best-effort check).
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print the --list-formats table: format, bitrate, extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode here requires Python 2 byte strings.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # 'best' (or no preference): take the first format whose URL answers;
        # otherwise insist on the requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            # old 'cond and a or b' idiom: u'NA' when no format was chosen
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2792
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three URL shapes are handled:
      * a specific video page -> a single-element list with the video info
      * a course page         -> recursively extracts every video reference
      * the site root         -> recursively extracts every course reference
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                # ElementTree.fromstring accepts the raw bytes directly.
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
            info['title'] = unescapeHTML(info['title'])

            info['description'] = self._search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)
            if info['description']: info['description'] = unescapeHTML(info['description'])

            # Raw string: '\?' is not a valid escape in a plain string literal.
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Recurse into every referenced video page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            # Use _download_webpage (as the course branch does) so the page
            # is decoded to text; re.findall with a str pattern against the
            # bytes returned by urlopen().read() fails under Python 3.
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            # Recurse into every referenced course page and flatten the results.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2890
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV.com video.

        Returns a one-element list with the usual info dictionary.
        Raises ExtractorError on an invalid URL or missing metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # mtv_vt carries the song name; mtv_an the artist/performer.
        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)
        if song_name: song_name = unescapeHTML(song_name)

        # BUG FIX: `performer` was previously referenced in the info dict
        # below without ever being assigned, raising NameError on every
        # extraction.  The artist is published in the mtv_an meta tag.
        performer = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer', fatal=False)
        if performer: performer = unescapeHTML(performer)

        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values above are optional (fatal=False) but required to build
        # the mediaGen request; fail with a clear error instead of a
        # TypeError on the string concatenation below.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2953
2954
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple flash segments; each segment gets its
    own download URL built from a de-obfuscated file id, so extraction
    returns one info dict per segment.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # numbers, mimicking what the official player generates.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet with a simple
        # LCG-style generator driven by `seed`.  The shuffled ordering is
        # the lookup table used to decode the obfuscated file id; the exact
        # arithmetic (including float seed and remove-as-you-go) must match
        # the player's algorithm.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The server-side file id is a '*'-separated list of indices into
        # the seed-shuffled alphabet; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user-requested format onto Youku's stream names:
            # 'best' prefers hd2 when offered, 'worst' takes mp4, and any
            # other request falls back to the plain flv stream.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number (as two hex digits) into the decoded
            # file id, then build the per-segment flv path with its key.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3047
3048
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element info list for an xnxx.com video URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player parameters.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, page, u'video URL'))

        title = self._search_regex(self.VIDEO_TITLE_RE, page, u'title')
        thumb = self._search_regex(self.VIDEO_THUMB_RE, page,
            u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }
        return [info]
3087
3088
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            # NOTE(review): assumes the timestamp text is exactly YYYY-MM-DD;
            # any other format raises ValueError here — confirm against page.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        # Tuples sort by their first element, the numeric resolution string.
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # Python 2 str has .decode; Python 3 str does not, hence the
        # AttributeError fallback that round-trips through bytes.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3162
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_path = match.group(1)

        webpage = self._download_webpage(url, video_path)

        # The CDN URL is derived directly from the URL path component.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4'

        short_id = video_path.rpartition('/')[2]
        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=short_id).replace('NBA.com: ', '')

        # The upload date is not present in the HTML that nba.com returns.

        description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
3196
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # A URL may name a whole channel, one archived broadcast (/b/), or a
    # chapter of a broadcast (/c/); each hits a different API below.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Number of clips requested per API page when listing a channel.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page; return (raw clip count, list of info dicts).

        Clips lacking a 'video_file_url' are skipped, so the returned list
        may be shorter than the returned count.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API answers with an error object, not a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with YYYY-MM-DD; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archives via the loop below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else clause fires only if no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3329
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        # The mp4 source is the second <source> tag of the <video> element.
        source_url = unescapeHTML(self._search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL))

        # Prefer the player headline; fall back to the page <title>.
        title = clean_html(self._search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'), page, 'title', flags=re.DOTALL))

        description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)
        if description:
            description = unescapeHTML(description)

        return [{
            'id': video_id,
            'url': source_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
3361
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailers (one playlist per game)."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        # Pass the age gate with a fixed 1970 birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, game_id)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movie definitions, display titles and thumbnails appear in the
        # same order on the page, so the three iterators are zipped.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        movie_iter = re.finditer(urlRE, webpage)
        title_iter = re.finditer(namesRE, webpage)
        thumb_iter = re.finditer(thumbsRE, webpage)

        videos = []
        for movie, name, thumb in zip(movie_iter, title_iter, thumb_iter):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
3406
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded streams are served straight from the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        if uploader:
            uploader = unescapeHTML(uploader.strip())

        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: unlike most extractors this returns a bare dict, not a list.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
3439
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3479
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata lives in an inline `gon` JSON blob.
        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
            webpage, u'json data')

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3513
3514
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age-verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError as e:
            # FIX: was a bare `except:` that masked unrelated errors;
            # also surface the parser message.
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # FIX: the original concatenated the exception object itself
            # to a string, raising TypeError instead of the intended error.
            raise ExtractorError(u'Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes resolution and bitrate,
            # e.g. "480p_370k_8004515" -> ['480p', '370k'].
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # FIX: the original tested the undefined name `result` here,
            # raising NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3619
3620
3621
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the id and the title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv location sits inside the player setup; it is URL-encoded.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # The upload date is optional; normalize to YYYYMMDD when present.
        upload_date = self._search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3656
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The watch page only carries the title; the actual stream URL
        # lives on a separate embed page.
        watch_page = self._download_webpage(url, video_id)

        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
            watch_page, u'title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The embed page uses its own numeric id.
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3697
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a javascript object.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        mix = json.loads(json_like)

        # The play API hands out one track at a time per (random) session;
        # each response names the URL of the next track.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3738
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract a keek video; CDN locations follow directly from the id."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are predictable from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = unescapeHTML(self._search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title'))

        uploader = self._search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)
        if uploader:
            uploader = clean_html(uploader)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3768
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in re.VERBOSE syntax,
        # which the base class's default matcher does not apply.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s] in these character classes matches only a
        # literal dot or whitespace, not "any character"; presumably
        # (.|\s) was intended -- confirm against current page markup.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is delegated back to this extractor via url_result;
        # only the talk URL from m_names is actually used here.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and the
        # media slug used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3847
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its metadata XML API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # FIX: the original read the undefined name `ext` here, raising
            # NameError on this fallback path; use the file extension.
            format = extension
        else:
            format = format_id_el.text
        # description and imagePreview are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3901
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a spiegel.de video via its per-video XML descriptor."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = unescapeHTML(self._search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title'))

        # A per-video XML document lists the available encodings; the last
        # child element is the variant we download.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3934
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix LiveLeak puts in front of every og:title.
        raw_title = self._search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        video_title = unescapeHTML(raw_title).replace('LiveLeak.com -', '').strip()

        video_description = self._search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        if video_description:
            video_description = unescapeHTML(video_description)

        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
3973
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (ardmediathek.de / daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # Each stream registration call carries media type, quality, an RTMP
    # URL (may be empty) and a video URL/play path.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all: the page flags age-restricted ("fsk")
            # content that is only served in the evening.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: the rtmp_url is the connection URL, video_url the play path.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # HTTP: video_url is a direct .mp4 download.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4012
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a video embedded in a tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The embedded player markup is hex-escaped inside a script,
        # hence the \x22 quote escapes in the pattern.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = unescapeHTML(self._search_regex(
            r'<title>(?P<title>.*?)</title>', webpage, u'title', flags=re.DOTALL))

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
4047
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed local from `id` to `video_id` -- it shadowed the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4093
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page embeds a plain HTML5 <source> tag with the mp4 URL.
        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
4121         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The public player exposes an MRSS feed per video; we scrape it
        # with the usual regex helpers rather than a feed parser.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
4148
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical watch page for the id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # Meta tags may use either quote style, hence the alternation.
        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
4182
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # All data comes from Twitter/OpenGraph meta tags on the page.
        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4216
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract a Flickr video via its two-step XML playlist protocol.

        Flow: fetch the photo page to obtain the secret, request the video
        metadata XML for the node id, then request the playlist XML that
        contains the actual stream APP/FULLPATH pair.
        """
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL, consistent with the other
        # extractors in this file (e.g. TeamcocoIE, HypemIE).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final video URL is APP + unescaped FULLPATH from the playlist.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # og: meta tags may use either double or single quotes, hence the
        # two alternative capture groups in each pattern.
        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4265
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        # Validate the URL and pull out the human-readable slug.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = mobj.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric video id lives on the <article> element of the page.
        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumb = self._search_regex(r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # A separate per-video XML document carries the actual media URL.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        info = {
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumb,
            'description': description,
        }
        return [info]
4304
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # 'www\.' was previously 'www.' — the unescaped dot matched any character.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata from an xHamster movie page."""
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL, consistent with the other
        # extractors in this file (e.g. TeamcocoIE, HypemIE).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config embeds either a streaming server plus file key,
        # or (when 'srv' is empty) a percent-encoded direct file URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')
        video_title = unescapeHTML(video_title)

        # No description is extracted: it is not visible anywhere in the
        # site's UI, so there is nothing reliable to scrape for it.

        # Upload date only appears as a tooltip ('hint') timestamp; it is
        # reassembled into YYYYMMDD as expected by the downloader.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
4357
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a hypem.com track page to its served MP3 URL.

        Flow: fetch the track page (keeping the session cookie), read the
        embedded JSON track list, then ask the /serve/source endpoint for
        the final audio URL, passing back the cookie.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' mimic the parameters of the site's own AJAX requests.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie is required by the /serve/source request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # Also covers well-formed JSON that lacks a non-empty 'tracks'
            # list, which previously escaped as a raw KeyError/IndexError.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # Empty body POST; the endpoint only needs the cookie and headers.
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4407
4408
def gen_extractors():
    """Build fresh instances of every supported extractor, in match order.

    Order is significant: the first extractor whose pattern matches a URL
    is the one that handles it, so the generic extractor comes last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4474
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    Raises KeyError if no such class exists in this module.
    """
    return globals()['%sIE' % ie_name]