282334635079560723bf38562abae0032bf15b96
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Common base for all information extractors.

    An information extractor (IE) receives a URL and produces the
    metadata needed to download the video(s) it points to.  The result
    is handed to the FileDownloader as a list of dictionaries.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses re-define _real_initialize() and _real_extract() and set
    a _VALID_URL regexp (and are usually added to the extractor list).
    _real_extract() must return a *list* of the dictionaries described
    above.  Broken IEs should set _WORKING to False so users are warned
    and their tests are skipped.
    """

    _ready = False          # True once _real_initialize() has run
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # set to False on broken extractors

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses any status message.
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            message = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (message, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        charset_match = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
            urlh.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return (raw_page.decode(encoding, 'replace'), urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they tag results with the proper '_type' value.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix selects the result count: empty -> 1 result,
        # "all" -> _MAX_RESULTS, a positive integer -> that many
        # (capped at _MAX_RESULTS in _real_extract).
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and delegate to _get_n_results()."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regexp already rejects 0 and negatives.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the original message ("sublclasses").
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches full watch/embed URLs as well as a naked 11-character-style
    # video ID; group 2 captures the ID itself (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Requests the English interface (hl=en, gl=US); used by _real_initialize.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of an age-verification style redirect URL.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (formats not listed default to 'flv').
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions as 'height x width' (e.g. '22' is 1280x720).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the InfoExtractor.IE_NAME property with a plain attribute.
    IE_NAME = u'youtube'
293
294     @classmethod
295     def suitable(cls, url):
296         """Receives a URL and returns True if suitable for this IE."""
297         if YoutubePlaylistIE.suitable(url): return False
298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
299
    # Status-reporting helpers; all output goes through InfoExtractor.to_screen().
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
315
    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being fetched."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language.format)."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        # sub_lang_list maps language code -> track name; only codes are shown.
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
328
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
340
341     def _get_available_subtitles(self, video_id):
342         self.report_video_subtitles_download(video_id)
343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
344         try:
345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350         if not sub_lang_list:
351             return (u'video doesn\'t have subtitles', None)
352         return sub_lang_list
353
    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for *video_id*."""
        # NOTE(review): on error _get_available_subtitles returns a tuple,
        # which report_video_subtitles_available cannot handle (.keys()
        # would fail) - confirm error handling with callers.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)
357
358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
359         """
360         Return tuple:
361         (error_message, sub_lang, sub)
362         """
363         self.report_video_subtitles_request(video_id, sub_lang, format)
364         params = compat_urllib_parse.urlencode({
365             'lang': sub_lang,
366             'name': sub_name,
367             'v': video_id,
368             'fmt': format,
369         })
370         url = 'http://www.youtube.com/api/timedtext?' + params
371         try:
372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
375         if not sub:
376             return (u'Did not fetch video subtitles', None, None)
377         return (None, sub_lang, sub)
378
379     def _request_automatic_caption(self, video_id, webpage):
380         """We need the webpage for getting the captions url, pass it as an
381            argument to speed up the process."""
382         sub_lang = self._downloader.params.get('subtitleslang')
383         sub_format = self._downloader.params.get('subtitlesformat')
384         self.to_screen(u'%s: Looking for automatic captions' % video_id)
385         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
386         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
387         if mobj is None:
388             return [(err_msg, None, None)]
389         player_config = json.loads(mobj.group(1))
390         try:
391             args = player_config[u'args']
392             caption_url = args[u'ttsurl']
393             timestamp = args[u'timestamp']
394             params = compat_urllib_parse.urlencode({
395                 'lang': 'en',
396                 'tlang': sub_lang,
397                 'fmt': sub_format,
398                 'ts': timestamp,
399                 'kind': 'asr',
400             })
401             subtitles_url = caption_url + '&' + params
402             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
403             return [(None, sub_lang, sub)]
404         except KeyError:
405             return [(err_msg, None, None)]
406
407     def _extract_subtitle(self, video_id):
408         """
409         Return a list with a tuple:
410         [(error_message, sub_lang, sub)]
411         """
412         sub_lang_list = self._get_available_subtitles(video_id)
413         sub_format = self._downloader.params.get('subtitlesformat')
414         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
415             return [(sub_lang_list[0], None, None)]
416         if self._downloader.params.get('subtitleslang', False):
417             sub_lang = self._downloader.params.get('subtitleslang')
418         elif 'en' in sub_lang_list:
419             sub_lang = 'en'
420         else:
421             sub_lang = list(sub_lang_list.keys())[0]
422         if not sub_lang in sub_lang_list:
423             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
424
425         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
426         return [subtitle]
427
428     def _extract_all_subtitles(self, video_id):
429         sub_lang_list = self._get_available_subtitles(video_id)
430         sub_format = self._downloader.params.get('subtitlesformat')
431         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
432             return [(sub_lang_list[0], None, None)]
433         subtitles = []
434         for sub_lang in sub_lang_list:
435             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
436             subtitles.append(subtitle)
437         return subtitles
438
439     def _print_formats(self, formats):
440         print('Available formats:')
441         for x in formats:
442             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
443
    def _real_initialize(self):
        """Prepare the session: set the language, optionally log in to
        Google, and confirm age.

        Credentials come from --username/--password or from the
        'youtube' .netrc entry.  Failures before age confirmation are
        reported as warnings and abort initialization; a failed age
        confirmation raises ExtractorError.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (request the English interface: hl=en, gl=US)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the hidden GALX/dsh form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in: replicate the browser's form post, hidden fields included.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form re-appearing in the response means the login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
546
547     def _extract_id(self, url):
548         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
549         if mobj is None:
550             raise ExtractorError(u'Invalid URL: %s' % url)
551         video_id = mobj.group(2)
552         return video_id
553
554     def _real_extract(self, url):
555         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
556         mobj = re.search(self._NEXT_URL_RE, url)
557         if mobj:
558             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
559         video_id = self._extract_id(url)
560
561         # Get video webpage
562         self.report_video_webpage_download(video_id)
563         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
564         request = compat_urllib_request.Request(url)
565         try:
566             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
568             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
569
570         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
571
572         # Attempt to extract SWF player URL
573         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
574         if mobj is not None:
575             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
576         else:
577             player_url = None
578
579         # Get video info
580         self.report_video_info_webpage_download(video_id)
581         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
582             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
583                     % (video_id, el_type))
584             video_info_webpage = self._download_webpage(video_info_url, video_id,
585                                     note=False,
586                                     errnote='unable to download video info webpage')
587             video_info = compat_parse_qs(video_info_webpage)
588             if 'token' in video_info:
589                 break
590         if 'token' not in video_info:
591             if 'reason' in video_info:
592                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
593             else:
594                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
595
596         # Check for "rental" videos
597         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
598             raise ExtractorError(u'"rental" videos not supported')
599
600         # Start extracting information
601         self.report_information_extraction(video_id)
602
603         # uploader
604         if 'author' not in video_info:
605             raise ExtractorError(u'Unable to extract uploader name')
606         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
607
608         # uploader_id
609         video_uploader_id = None
610         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
611         if mobj is not None:
612             video_uploader_id = mobj.group(1)
613         else:
614             self._downloader.report_warning(u'unable to extract uploader nickname')
615
616         # title
617         if 'title' not in video_info:
618             raise ExtractorError(u'Unable to extract video title')
619         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
620
621         # thumbnail image
622         if 'thumbnail_url' not in video_info:
623             self._downloader.report_warning(u'unable to extract video thumbnail')
624             video_thumbnail = ''
625         else:   # don't panic if we can't find it
626             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
627
628         # upload date
629         upload_date = None
630         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
631         if mobj is not None:
632             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
633             upload_date = unified_strdate(upload_date)
634
635         # description
636         video_description = get_element_by_id("eow-description", video_webpage)
637         if video_description:
638             video_description = clean_html(video_description)
639         else:
640             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
641             if fd_mobj:
642                 video_description = unescapeHTML(fd_mobj.group(1))
643             else:
644                 video_description = u''
645
646         # subtitles
647         video_subtitles = None
648
649         if self._downloader.params.get('writesubtitles', False):
650             video_subtitles = self._extract_subtitle(video_id)
651             if video_subtitles:
652                 (sub_error, sub_lang, sub) = video_subtitles[0]
653                 if sub_error:
654                     # We try with the automatic captions
655                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
656                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
657                     if sub is not None:
658                         pass
659                     else:
660                         # We report the original error
661                         self._downloader.report_error(sub_error)
662
663         if self._downloader.params.get('allsubtitles', False):
664             video_subtitles = self._extract_all_subtitles(video_id)
665             for video_subtitle in video_subtitles:
666                 (sub_error, sub_lang, sub) = video_subtitle
667                 if sub_error:
668                     self._downloader.report_error(sub_error)
669
670         if self._downloader.params.get('listsubtitles', False):
671             sub_lang_list = self._list_available_subtitles(video_id)
672             return
673
674         if 'length_seconds' not in video_info:
675             self._downloader.report_warning(u'unable to extract video duration')
676             video_duration = ''
677         else:
678             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
679
680         # token
681         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
682
683         # Decide which formats to download
684         req_format = self._downloader.params.get('format', None)
685
686         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
687             self.report_rtmp_download()
688             video_url_list = [(None, video_info['conn'][0])]
689         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
690             url_map = {}
691             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
692                 url_data = compat_parse_qs(url_data_str)
693                 if 'itag' in url_data and 'url' in url_data:
694                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
695                     if not 'ratebypass' in url: url += '&ratebypass=yes'
696                     url_map[url_data['itag'][0]] = url
697
698             format_limit = self._downloader.params.get('format_limit', None)
699             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
700             if format_limit is not None and format_limit in available_formats:
701                 format_list = available_formats[available_formats.index(format_limit):]
702             else:
703                 format_list = available_formats
704             existing_formats = [x for x in format_list if x in url_map]
705             if len(existing_formats) == 0:
706                 raise ExtractorError(u'no known formats available for video')
707             if self._downloader.params.get('listformats', None):
708                 self._print_formats(existing_formats)
709                 return
710             if req_format is None or req_format == 'best':
711                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
712             elif req_format == 'worst':
713                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
714             elif req_format in ('-1', 'all'):
715                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
716             else:
717                 # Specific formats. We pick the first in a slash-delimeted sequence.
718                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
719                 req_formats = req_format.split('/')
720                 video_url_list = None
721                 for rf in req_formats:
722                     if rf in url_map:
723                         video_url_list = [(rf, url_map[rf])]
724                         break
725                 if video_url_list is None:
726                     raise ExtractorError(u'requested format not available')
727         else:
728             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
729
730         results = []
731         for format_param, video_real_url in video_url_list:
732             # Extension
733             video_extension = self._video_extensions.get(format_param, 'flv')
734
735             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
736                                               self._video_dimensions.get(format_param, '???'))
737
738             results.append({
739                 'id':       video_id,
740                 'url':      video_real_url,
741                 'uploader': video_uploader,
742                 'uploader_id': video_uploader_id,
743                 'upload_date':  upload_date,
744                 'title':    video_title,
745                 'ext':      video_extension,
746                 'format':   video_format,
747                 'thumbnail':    video_thumbnail,
748                 'description':  video_description,
749                 'player_url':   player_url,
750                 'subtitles':    video_subtitles,
751                 'duration':     video_duration
752             })
753         return results
754
755
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and POST an age
        confirmation so later video pages are served unfiltered."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video id, media URL, title and uploader from a metacafe page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to the YouTube IE
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: the media URL is embedded in the flashvars parameter
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: the webpage (and everything extracted from it) is already a
        # unicode string, so the old .decode('utf-8') calls raised
        # AttributeError on Python 3 and broke on non-ASCII data on Python 2.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
851
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract video URL, title, uploader and upload date from a Dailymotion page."""
        # Pull the video id out of the URL
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The stream URLs live in the "flashvars" javascript variable
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the best quality that is present, highest first
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: try regular users first, then "official" accounts
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date is rendered DD-MM-YYYY; normalize to YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
926
927
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: webpage (and regex groups taken from it) are already unicode
        # strings; the old .decode('utf-8') calls raised AttributeError on
        # Python 3 and mishandled non-ASCII text on Python 2.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
989
990
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract the stream URL and metadata from a screen.yahoo.com page.

        Two paths: if the page declares a Media CONTENT_ID, query the YQL
        JSON API with it; otherwise fall back to the older cosmos.bcst
        MRSS endpoints keyed on the numeric id from the URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize MM/DD/YYYY to YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: test for a failed match *before* dereferencing it; the
            # old order raised AttributeError instead of ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper before parsing
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1058
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a play_redirect URL and metadata from a vimeo.com page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: a bare "except:" also swallowed SystemExit and
        # KeyboardInterrupt, making the downloader hard to abort here.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1160
1161
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams end in e.g. "index-123.html"; used to pick the code path
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        # Download a page as raw bytes, converting network failures into
        # ExtractorError
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch `url`, apply `regex` with `regexFlags`, and map each
        # (group_index, key, error_message) in `matchTuples` into a dict.
        # Raises ExtractorError with the given message when a group is missing.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Follow the videothek JS reference to locate the rtmp live stream.
        # NOTE(review): video_url is computed below but never returned, and
        # _real_extract discards this method's result — live-stream extraction
        # appears to be dead/unfinished code; confirm before relying on it.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Arte+7 (catch-up) videos: chase two levels of XML references, then
        # read id/title/date and the hd-quality URL from the final document.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode('utf-8') assumes fetch_webpage returned
            # bytes; on Python 3 re.search over bytes would need a bytes
            # pattern — confirm which type actually flows through here.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs take the (unfinished) live path and yield no result
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1281
1282
1283 class GenericIE(InfoExtractor):
1284     """Generic last-resort information extractor."""
1285
1286     _VALID_URL = r'.*'
1287     IE_NAME = u'generic'
1288
1289     def report_download_webpage(self, video_id):
1290         """Report webpage download."""
1291         if not self._downloader.params.get('test', False):
1292             self._downloader.report_warning(u'Falling back on generic information extractor.')
1293         super(GenericIE, self).report_download_webpage(video_id)
1294
1295     def report_following_redirect(self, new_url):
1296         """Report information extraction."""
1297         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1298
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects; returns the final URL if it differs from `url`,
        otherwise False.
        """
        # Request subclass that issues HEAD instead of GET
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE(review): handlers are added in this specific order; urllib
        # dispatches by handler_order, so keep the list as-is.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same final URL means no redirect happened
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             raise ExtractorError(u'Invalid URL: %s' % url)
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             raise ExtractorError(u'Invalid URL: %s' % url)
1378
1379         # It's possible that one of the regexes
1380         # matched, but returned an empty group:
1381         if mobj.group(1) is None:
1382             raise ExtractorError(u'Invalid URL: %s' % url)
1383
1384         video_url = compat_urllib_parse.unquote(mobj.group(1))
1385         video_id = os.path.basename(video_url)
1386
1387         # here's a fun little line of code for you:
1388         video_extension = os.path.splitext(video_id)[1][1:]
1389         video_id = os.path.splitext(video_id)[0]
1390
1391         # it's tempting to parse this further, but you would
1392         # have to take into account all the variations like
1393         #   Video Title - Site Name
1394         #   Site Name | Video Title
1395         #   Video Title - Tagline | Site Name
1396         # and so on and so forth; it's just not practical
1397         mobj = re.search(r'<title>(.*)</title>', webpage)
1398         if mobj is None:
1399             raise ExtractorError(u'Unable to extract title')
1400         video_title = mobj.group(1)
1401
1402         # video uploader is domain name
1403         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404         if mobj is None:
1405             raise ExtractorError(u'Unable to extract title')
1406         video_uploader = mobj.group(1)
1407
1408         return [{
1409             'id':       video_id,
1410             'url':      video_url,
1411             'uploader': video_uploader,
1412             'upload_date':  None,
1413             'title':    video_title,
1414             'ext':      video_extension,
1415         }]
1416
1417
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # GData search endpoint; takes the quoted query and a 1-based start index
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API (50 results per page) until n results
        are collected or the API reports fewer total hits.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # idiom fix: 'x not in y' instead of 'not x in y'
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # idiom fix: plain list comprehension instead of list(genexp)
            video_ids += [video['id'] for video in api_response['items']]

            # Never ask for more results than the API says exist
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1459
1460
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # Collect url entries into the playlist dict as we page through
        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each result link on the page becomes a bare url entry
            entries.extend(
                {'_type': 'url', 'url': m.group(1)}
                for m in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results are gathered or no next page exists
            enough = pagenum * 10 > n
            has_next = re.search(self._MORE_PAGES_INDICATOR, webpage)
            if enough or not has_next:
                return res
1491
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo JSON search endpoint (30 results per
        page) until n results are collected or the service reports that
        the last page was reached.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']              # paging metadata ('last', 'total')
            results = info[u'results']

            # BUG FIX: 'i' was unbound (NameError in the break test below)
            # whenever a page contained no results; seed it with a sentinel.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
1525
1526
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches playlist/course/artist/watch URLs carrying a p=, a= or list=
    # parameter, the short /p/<id> form, or a bare PL/EC/UU playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData playlist feed; filled with playlist id, page size, 1-based start index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose form, so re.VERBOSE is required;
        # that is why the default suitable() implementation is overridden.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # group(1) is the id from a full URL form, group(2) a bare id
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so entries can be sorted into
            # playlist order below; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means the feed is exhausted
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position component
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1592
1593
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the watch-page video ids in *page*, first-seen order,
        without duplicates."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and return it as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Subsequent pages come from the JSON-based channel_ajax endpoint
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # The load-more widget disappears on the final page
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1651
1652
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a playlist with every upload of a YouTube user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData uploads feed caps each response (currently at 50
        # entries), so keep paging until a short page signals the end.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the ids on this page, keeping first-seen order and
            # dropping duplicates within the page
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the last
            # one, so no further query is needed
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1709
1710
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist with every video of a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # BUG FIX: this previously crashed with AttributeError when the
            # user page carried no data-users-id attribute
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A page with fewer than _PAGE_SIZE ids must be the last one,
            # so no further query is needed
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1769
1770
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode() calls below assume Python 2 byte
        # strings (urlopen().read() is undecoded here); on Python 3 these
        # would raise AttributeError - confirm the targeted interpreter.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1818
1819
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available.

        Credentials come from --username/--password or, with --netrc,
        from the 'facebook' machine entry in ~/.netrc. Login failures
        only emit warnings; extraction proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials found: stay anonymous
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are a JSON blob sandwiched between these
        # two literal snippets of the page's inline JavaScript
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1916
1917
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # Rewrite api.swf fragment URLs to /play/ URLs first
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the
            # real file URL; resolve the redirect and restart extraction
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Request the JSON metadata for the video
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode is Python 2 only here - confirm
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; read the JSON body
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # e.g. '05-21-13 08:22PM' -> '20130521'
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                # Any missing key or bad timestamp ends up here
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2015
2016
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Decrypt the RC4-encrypted bytes *data* with *key*, returning a str."""
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of the bytes *s*, as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site secret used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page exposes a direct <source> element.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            # Sanity check only: the returned ext is hard-coded to flv below.
            mobj = re.search('[.](.+?)$', video_url)
            if mobj is None:
                raise ExtractorError(u'Unable to extract extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Rebuild the player's request parameters from the flashvars blob;
        # the _encxml entry is the URL of the encrypted XML data itself.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(base64-decode twice(GK) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_file     = compat_urllib_parse.unquote(mobj.group(1))

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # BUGFIX: video_filepath was referenced below without ever being
            # assigned, so every f4m video died with a NameError.  The path
            # prefix appears in the decrypted data as filepath='...'
            # (presumably alongside connectionurl/source -- confirm against
            # a live response if the site still serves f4m).
            mobj = re.search('filepath=\'(.+?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract filepath')
            video_filepath = mobj.group(1)
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':                 video_id,
            'url':                video_rtmpurl,
            'tc_url':             video_rtmpurl,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2176
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates (as strings) the mediaGen feed may offer.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Per-bitrate container extension, shown by --list-formats.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Per-bitrate display resolution, shown by --list-formats.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose form (whitespace/comments), so the
        # re.VERBOSE flag is mandatory here and in every match below.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print available format ids with their extension and resolution."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode/clip; returns a list of info dicts."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Abbreviated forms (":tds", ":colbert", ...) are rewritten to the
        # show's full-episodes page and re-matched against _VALID_URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode title; dlNewest means no specific episode was
        # requested, so we must follow the site's redirect to the newest one.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The handle's final URL (after redirects) names the episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid-style URI of the media, embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The MRSS index lists every part the episode is split into.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "...:<show>.com:<mediaId>"; split accordingly.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # mediaGen config enumerates the renditions (bitrate -> RTMP src).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a plain HTTP one on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2343
2344
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video info dict from an escapistmagazine.com page.

        Raises ExtractorError when a required element (player URL, config
        URL, valid config JSON) cannot be found; description and thumbnail
        are optional and fall back to None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # BUGFIX: every re.search below could return None, which previously
        # crashed with a bare AttributeError instead of a clear message.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL and the config URL encoded in it are required.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2398
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest download is starting."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _fetch_xml(self, url):
        """Fetch *url* raw, mapping network failures to ExtractorError."""
        try:
            return compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # The moogaloop metadata document carries title/description/thumbnail
        # plus the URL of the f4m manifest.
        meta_doc = xml.etree.ElementTree.fromstring(
            self._fetch_xml('http://www.collegehumor.com/moogaloop/video/' + video_id))
        try:
            video_node = meta_doc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url = manifest_url + '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifest_doc = xml.etree.ElementTree.fromstring(self._fetch_xml(manifest_url))
        try:
            media_node = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = manifest_doc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Build the first-segment URL from the manifest location and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = '%s://%s/z%s/%sSeg1-Frag1' % (
            url_pr.scheme, url_pr.netloc, video_id[:-2], node_id)
        info['ext'] = 'f4f'
        return [info]
2460
2461
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded inside the player's flashvars.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title is taken from the page <title>, trimming the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = match.group(1)

        # Thumbnail: the entire matched URL (group 0) is kept.
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2511
2512
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and track slug are both encoded in the permalink URL.
        uploader = match.group(1)
        slug_title = match.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the API's JSON description of the track.
        permalink = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + permalink + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))

        track_id = info['id']
        self.report_extraction(full_title)

        # The streams endpoint yields the actual downloadable mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2569
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a set id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and set slug are both encoded in the permalink URL.
        uploader = match.group(1)
        slug_title = match.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the API's JSON description of the set.
        permalink = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + permalink + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        # The API reports problems through an 'errors' list.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            track_id = track['id']

            # Each track needs its own streams lookup for the mp3 URL.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       track_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2632
2633
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream id is base64-encoded in the page's jsclassref var.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = m.group(1)

        # Description is optional; keep a placeholder when it is missing.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The final path component doubles as "<id>.<extension>".
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2680
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Picks the highest bitrate when *bitrate* is None, 'best', or not
        offered for *fmt*; when the entry carries no bitrate mapping at
        all, the format's URL list is returned directly.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: the regex groups are already text (str on Python 3), so the
        # former .decode('utf-8') calls raised AttributeError there.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON; decode the raw bytes first so this also works on
        # Python 3 versions whose json.loads rejects bytes input
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialize so an empty 'audio_formats' section cannot leave these
        # unbound (or leave file_url pointing at the API URL) below.
        format_param = None
        file_url = None

        if req_format is None or req_format == 'best':
            # try each advertised format until one of its URLs responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # every candidate URL was dead (or no formats were offered)
            raise ExtractorError(u'Unable to find a working download URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2785
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three URL shapes are handled:
      * course + video  -> extract that single video
      * course only     -> build a playlist of references to the course's
                           video pages
      * site root       -> build a playlist of references to all course pages
    Playlist references are resolved recursively through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Each video has a sibling .xml metadata file under the course dir.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and the relative media file name come from the XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id when missing.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect the (de-duplicated) links to every video page.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference is handled by this same extractor.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect links to every course page, then recurse into each.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2886
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads song/performer/uri metadata from the video page, queries MTV's
    mediaGen service for the rendition list, and picks the last (highest
    quality) rendition.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage returns an already-decoded unicode string,
        # so the former .decode('iso-8859-1') calls on the matched groups
        # were wrong (AttributeError on Python 3) and have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: it used to read 'Unable to mtvn_uri'.
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard: renditions[-1] below would raise a bare IndexError.
            raise ExtractorError(u'Unable to find any renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2955
2956
class YoukuIE(InfoExtractor):
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp plus
        two random number blocks, concatenated as decimal digits."""
        millis = int(time.time() * 1000)
        block_a = random.randint(1000, 1998)
        block_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, block_a, block_b)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the character pool using Youku's
        linear-congruential scheme; the result decodes obfuscated file ids."""
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        state = float(seed)
        shuffled = []
        while pool:
            state = (state * 211 + 30031) % 65536
            pick = int(math.floor(state / 65536 * len(pool)))
            # All pool characters are unique, so popping by index matches
            # the original remove-by-value behaviour exactly.
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated obfuscated file id into the real id by
        indexing into the seed-shuffled character table."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(part)] for part in fileId.split('*') if part)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            # Map the requested format onto what Youku serves; anything
            # unrecognized falls back to flv.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 (0-based) of the decoded fileid encode the segment
        # number; substitute them per segment when building download URLs.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3049
3050
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page once; all three fields are scraped from it.
        webpage = self._download_webpage(url, video_id)

        def _scrape(pattern, what):
            # First capture group of pattern, or an ExtractorError naming
            # the missing field (messages identical to the originals).
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(u'Unable to extract %s' % what)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(_scrape(self.VIDEO_URL_RE, u'video url'))
        video_title = _scrape(self.VIDEO_TITLE_RE, u'video title')
        video_thumbnail = _scrape(self.VIDEO_THUMB_RE, u'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3094
3095
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3205
class NBAIE(InfoExtractor):
    """Information extractor for nba.com / watch.nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN media URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Unescaped first capture group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: this key was misspelled 'uploader_date'; the field the
            # downloader recognizes (see InfoExtractor docstring) is
            # 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3240
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips the API returns per page request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips into info dicts.

        Returns (raw clip count in the response, info dicts for the clips
        that actually carry a video_file_url). The raw count lets the
        caller detect the final (short) page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API reports errors as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter (/c/) URL: resolve it to its containing archive and
            # return immediately (no paging).
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element matching the page's archive_id; the
            # for/else raises when no element matches.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast (/b/) URL.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3373
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The stream URL sits in the second <source> tag of the player markup.
        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not title_match:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not title_match:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3411
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the generic matcher
        # cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Hit the age-gate URL directly with a fixed (old enough) birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # The three patterns occur once per clip and in the same order, so
        # zipping the iterators pairs each clip with its title and thumbnail.
        clips = zip(re.finditer(movie_re, webpage),
                    re.finditer(name_re, webpage),
                    re.finditer(thumb_re, webpage))
        videos = []
        for movie_m, name_m, thumb_m in clips:
            clip_id = movie_m.group('videoID')
            clip_url = movie_m.group('videoURL')
            if not clip_url:
                raise ExtractorError(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3456
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media file is served from the CDN directly by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # Any failed re.search returns None, whose .group raises
            # AttributeError -- caught below as a single extraction failure.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_match = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                       webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_match.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3486
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player receives the media URL via an so.addVariable() call.
        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3535
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON blob in an inline <script>.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream; the extension is taken from the URL path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3570
3571
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of `formats` whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract video info; honors the --format and --list-formats options."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal: warn and continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Links are listed best-to-worst, so the first entry is the default.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously this tested the stale regex match `result`
            # instead of the looked-up `format`, so a missing requested
            # format silently returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3686
3687
3688
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken directly from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the error message wrongly said "video title" here.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3727
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The title comes from the page's <title> element.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual media URL lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed page stores the stream URL in the player's "file" variable.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3772
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment in the page scripts.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session token identifies this play session to the API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        # Fetch tracks one at a time until the API flags the last one.
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3816
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs are built directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3840
3841 class TEDIE(InfoExtractor):
3842     _VALID_URL=r'''http://www\.ted\.com/
3843                    (
3844                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3845                         |
3846                         ((?P<type_talk>talks)) # We have a simple talk
3847                    )
3848                    (/lang/(.*?))? # The url may contain the language
3849                    /(?P<name>\w+) # Here goes the name and then ".html"
3850                    '''
3851
3852     @classmethod
3853     def suitable(cls, url):
3854         """Receives a URL and returns True if suitable for this IE."""
3855         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3856
3857     def _real_extract(self, url):
3858         m=re.match(self._VALID_URL, url, re.VERBOSE)
3859         if m.group('type_talk'):
3860             return [self._talk_info(url)]
3861         else :
3862             playlist_id=m.group('playlist_id')
3863             name=m.group('name')
3864             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3865             return [self._playlist_videos_info(url,name,playlist_id)]
3866
3867     def _talk_video_link(self,mediaSlug):
3868         '''Returns the video link for that mediaSlug'''
3869         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3870
3871     def _playlist_videos_info(self,url,name,playlist_id=0):
3872         '''Returns the videos of the playlist'''
3873         video_RE=r'''
3874                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3875                      ([.\s]*?)data-playlist_item_id="(\d+)"
3876                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3877                      '''
3878         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3879         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3880         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3881         m_names=re.finditer(video_name_RE,webpage)
3882
3883         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3884         m_playlist = re.search(playlist_RE, webpage)
3885         playlist_title = m_playlist.group('playlist_title')
3886
3887         playlist_entries = []
3888         for m_video, m_name in zip(m_videos,m_names):
3889             video_id=m_video.group('video_id')
3890             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3891             playlist_entries.append(self.url_result(talk_url, 'TED'))
3892         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3893
3894     def _talk_info(self, url, video_id=0):
3895         """Return the video for the talk in the url"""
3896         m=re.match(self._VALID_URL, url,re.VERBOSE)
3897         videoName=m.group('name')
3898         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3899         # If the url includes the language we get the title translated
3900         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3901         title=re.search(title_RE, webpage).group('title')
3902         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3903                         "id":(?P<videoID>[\d]+).*?
3904                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3905         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3906         thumb_match=re.search(thumb_RE,webpage)
3907         info_match=re.search(info_RE,webpage,re.VERBOSE)
3908         video_id=info_match.group('videoID')
3909         mediaSlug=info_match.group('mediaSlug')
3910         video_url=self._talk_video_link(mediaSlug)
3911         info = {
3912                 'id': video_id,
3913                 'url': video_url,
3914                 'ext': 'mp4',
3915                 'title': title,
3916                 'thumbnail': thumb_match.group('thumbnail')
3917                 }
3918         return info
3919
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced an undefined name `ext` and
            # raised NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # Description and preview image are optional in the metadata XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3973
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="module-title">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A companion XML document lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last entry in the document is the variant we download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4006
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site branding prefix from the og:title value.
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4053
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter when present,
        # otherwise fall back to the last path element.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: hand rtmpdump the server URL plus the play path.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # Plain HTTP download.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4092
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUG FIX: a list comprehension is never None, so the old
        # `streams is None` check could not fire; test emptiness instead.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUG FIX: initialize stream_ so that a page with no matching stream
        # raises ExtractorError below instead of UnboundLocalError.
        stream_ = None
        for s in streams:        #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_=s
                break
        for s in streams:        #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_=s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The chosen link points at an intermediate page containing the
        # actual mms:// (or rtsp://) stream URL.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the file extension from the stream URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
4150
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Canonicalize to the /post/ URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        video_pattern = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(video_pattern, webpage)
        if video_match is None:
            self.to_screen("No video found")
            return []

        # We pick the first poster as the thumbnail.
        thumb_pattern = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumbnail = re.search(thumb_pattern, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title_pattern = r'<title>(?P<title>.*?)</title>'
        page_title = unescapeHTML(re.search(title_pattern, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_match.group('video_url'),
                 'title': page_title,
                 'thumbnail': thumbnail,
                 'ext': video_match.group('ext')
                 }]
4184
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')
        download_link = m_download.group(1)

        # Renamed from `id` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': info[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': info[u'thumb_url'],
                 'uploader': info[u'artist']
                 }]
4230
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct media URL sits in the HTML5 <source> tag.
        m_url = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if m_url is None:
            raise ExtractorError(u'Unable to extract media URL')

        m_title = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      m_url.group(1),
            'ext':      'mp4',
            'title':    m_title.group(1),
        }]
4261         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The MRSS feed for a video exposes the direct mp4 url and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        # Dots in the host are escaped so only the literal mp4.ina.fr matches.
        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
4290
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 url appears in an inline player configuration.
        m_url = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if m_url is None:
            raise ExtractorError(u'Unable to extract video URL')

        m_title = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        title = m_title.group(1) or m_title.group(2)

        # The description is optional; warn instead of failing.
        m_desc = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if m_desc is None:
            self._downloader.report_warning(u'unable to extract description')
            description = None
        else:
            description = m_desc.group(1) or m_desc.group(2)

        m_thumb = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if m_thumb is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        return [{
            'id':       video_id,
            'url':      m_url.group(1),
            'ext':      'mp4',
            'title':    title,
            'description': description,
            'thumbnail': m_thumb.group(1),
        }]
4334
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):

        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # Stream URL is published as a twitter player meta tag.
        m_stream = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video URL')

        m_title = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')

        # Strip any query string from the thumbnail URL.
        m_thumb = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if m_thumb is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        m_uploader = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if m_uploader is None:
            raise ExtractorError(u'Unable to extract uploader')

        return [{
            'id':        video_id,
            'url':       m_stream.group(1),
            'ext':       'mp4',
            'title':     m_title.group(1),
            'thumbnail': m_thumb.group(1),
            'uploader':  m_uploader.group(1),
        }]
4377
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        uploader_id = mobj.group('uploader_id')
        webpage = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id, video_id)

        # The per-photo secret is required for the playlist API calls below.
        m_secret = re.search(r"photo_secret: '(\w+)'", webpage)
        if m_secret is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = m_secret.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        m_node = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if m_node is None:
            raise ExtractorError(u'Unable to extract node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + m_node.group(1) + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist gives the stream as APP prefix + HTML-escaped path.
        m_stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = m_stream.group(1) + unescapeHTML(m_stream.group(2))

        m_title = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')

        # Description is optional; warn rather than abort.
        m_desc = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if m_desc is None:
            self._downloader.report_warning(u'unable to extract description')
            description = None
        else:
            description = m_desc.group(1) or m_desc.group(2)

        m_thumb = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if m_thumb is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       m_title.group(1) or m_title.group(2),
            'description': description,
            'thumbnail':   m_thumb.group(1) or m_thumb.group(2),
            'uploader_id': uploader_id,
        }]
4439
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = m.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is embedded in the <article> tag of the page.
        video_id = re.search(r'<article class="video" data-id="(\d+?)"', webpage).group(1)

        self.report_extraction(video_id)

        m_title = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')

        m_thumb = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if m_thumb is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        m_desc = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if m_desc is None:
            raise ExtractorError(u'Unable to extract description')

        # A separate XML document carries the actual media files.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        m_file = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if m_file is None:
            raise ExtractorError(u'Unable to extract video url')

        return [{
            'id':          video_id,
            'url':         m_file.group(1),
            'ext':         'mp4',
            'title':       m_title.group(1),
            'thumbnail':   m_thumb.group(1),
            'description': m_desc.group(1),
        }]
4485         
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped; a bare "." would match any character.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server part: 'file' already holds the full urlencoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Description is optional; fall back to an empty string.
        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        if mobj is None:
            video_description = u''
        else:
            video_description = unescapeHTML(mobj.group('description'))

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract upload date')
        # Assemble the date as YYYYMMDD, the format used by upload_date fields.
        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')

        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        if mobj is None:
            video_uploader_id = u'anonymous'
        else:
            video_uploader_id = mobj.group('uploader_id')

        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')
        video_thumbnail = mobj.group('thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
4542
class HypemIE(InfoExtractor):
    """Information Extractor for hypem (Hype Machine)"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # hypem expects an 'ax' flag and a timestamp on the page request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie is needed later to fetch the track source URL.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)
        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract tracks')
        html_tracks = mobj.group(1).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4593
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via an inline javascript assignment;
        # follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        m_location = re.search(r'window\.location = \'(.*)\';', redirect_page)
        if m_location is None:
            raise ExtractorError(u'Unable to extract redirect URL')
        redirect_url = urlh.geturl() + m_location.group(1)
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        m_title = re.search(r'<title>(.*)</title>', webpage)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        # The page title has extra parts after a '/'; keep only the first.
        title = m_title.group(1).split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is parsed as two '&'-separated key=value fields:
        # the final media url and the thumbnail url (in that order).
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
4628
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered: more specific extractors must precede generic ones,
    # and GenericIE must remain last as the fallback.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
4696
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the
    # class can be looked up directly in the module namespace.
    return globals()['%sIE' % ie_name]