33ba0fdd1be218ae0a5e04348545defeb407b386
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults.  _ready flips to True once _real_initialize()
    # has run (see initialize()); _downloader is injected via
    # set_downloader(); _WORKING marks the extractor as usable.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        Idempotent: _real_initialize() is only run on the first call.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Drop the last two characters of the class name; by convention
        # subclasses are named FooIE, so this yields 'Foo'.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None prints the default "Downloading webpage" message;
        # note=False suppresses any message; any other value is printed as-is.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): 'sys' is only in scope via 'from .utils import *';
            # the second argument is presumably the original traceback for
            # ExtractorError (defined in .utils) — confirm its signature.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Pick the charset out of a 'type/subtype; charset=...' Content-Type
        # header; fall back to UTF-8 when none is declared.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # --dump-intermediate-pages: Request objects know their URL,
            # plain strings are the URL.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 so arbitrary (possibly binary) content survives the terminal
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps going on undecodable byte sequences
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        # id/title are only attached when known
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (single best result), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search "URL" and delegate to _get_n_results.

        Raises ExtractorError when the query does not match the search
        pattern or asks for a non-positive number of results.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # bare search key: fetch only the top result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regex only admits positive integers, but keep
            # the guard in case a subclass loosens the pattern.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's hard limit and warn the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose (re.VERBOSE) pattern: matches watch pages, embeds, youtu.be
    # short links, or a bare video ID.  Group 2 captures the video ID and
    # is what _extract_id() returns.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Visiting this URL pins the site to English/US so scraping regexes
    # keep matching (see _real_initialize).
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the redirect target of age-verification style URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, reordered to rank free (WebM) formats ahead of
    # comparable-quality non-free ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (see _print_formats); unlisted itags
    # default to 'flv' there.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'heightxwidth' display string (see _print_formats).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the IE_NAME property of InfoExtractor with a plain attribute.
    IE_NAME = u'youtube'
293
294     @classmethod
295     def suitable(cls, url):
296         """Receives a URL and returns True if suitable for this IE."""
297         if YoutubePlaylistIE.suitable(url): return False
298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
299
    def report_lang(self):
        """Report attempt to set the site interface language."""
        self.to_screen(u'Setting language')
303
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')
307
    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
311
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
315
    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
319
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
323
324     def report_video_subtitles_available(self, video_id, sub_lang_list):
325         """Report available subtitles."""
326         sub_lang = ",".join(list(sub_lang_list.keys()))
327         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
328
    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)
332
    def report_unavailable_format(self, video_id, format):
        """Report that the requested video format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
336
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
340
341     def _get_available_subtitles(self, video_id):
342         self.report_video_subtitles_download(video_id)
343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
344         try:
345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
350         if not sub_lang_list:
351             return (u'video doesn\'t have subtitles', None)
352         return sub_lang_list
353
354     def _list_available_subtitles(self, video_id):
355         sub_lang_list = self._get_available_subtitles(video_id)
356         self.report_video_subtitles_available(video_id, sub_lang_list)
357
358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
359         """
360         Return tuple:
361         (error_message, sub_lang, sub)
362         """
363         self.report_video_subtitles_request(video_id, sub_lang, format)
364         params = compat_urllib_parse.urlencode({
365             'lang': sub_lang,
366             'name': sub_name,
367             'v': video_id,
368             'fmt': format,
369         })
370         url = 'http://www.youtube.com/api/timedtext?' + params
371         try:
372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
375         if not sub:
376             return (u'Did not fetch video subtitles', None, None)
377         return (None, sub_lang, sub)
378
379     def _request_automatic_caption(self, video_id, webpage):
380         """We need the webpage for getting the captions url, pass it as an
381            argument to speed up the process."""
382         sub_lang = self._downloader.params.get('subtitleslang')
383         sub_format = self._downloader.params.get('subtitlesformat')
384         self.to_screen(u'%s: Looking for automatic captions' % video_id)
385         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
386         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
387         if mobj is None:
388             return [(err_msg, None, None)]
389         player_config = json.loads(mobj.group(1))
390         try:
391             args = player_config[u'args']
392             caption_url = args[u'ttsurl']
393             timestamp = args[u'timestamp']
394             params = compat_urllib_parse.urlencode({
395                 'lang': 'en',
396                 'tlang': sub_lang,
397                 'fmt': sub_format,
398                 'ts': timestamp,
399                 'kind': 'asr',
400             })
401             subtitles_url = caption_url + '&' + params
402             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
403             return [(None, sub_lang, sub)]
404         except KeyError:
405             return [(err_msg, None, None)]
406
407     def _extract_subtitle(self, video_id):
408         """
409         Return a list with a tuple:
410         [(error_message, sub_lang, sub)]
411         """
412         sub_lang_list = self._get_available_subtitles(video_id)
413         sub_format = self._downloader.params.get('subtitlesformat')
414         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
415             return [(sub_lang_list[0], None, None)]
416         if self._downloader.params.get('subtitleslang', False):
417             sub_lang = self._downloader.params.get('subtitleslang')
418         elif 'en' in sub_lang_list:
419             sub_lang = 'en'
420         else:
421             sub_lang = list(sub_lang_list.keys())[0]
422         if not sub_lang in sub_lang_list:
423             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
424
425         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
426         return [subtitle]
427
428     def _extract_all_subtitles(self, video_id):
429         sub_lang_list = self._get_available_subtitles(video_id)
430         sub_format = self._downloader.params.get('subtitlesformat')
431         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
432             return [(sub_lang_list[0], None, None)]
433         subtitles = []
434         for sub_lang in sub_lang_list:
435             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
436             subtitles.append(subtitle)
437         return subtitles
438
439     def _print_formats(self, formats):
440         print('Available formats:')
441         for x in formats:
442             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
443
    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age.

        Order matters: the language is forced to English/US first (the
        scraping regexes depend on it), then the Google account login is
        attempted if credentials are available, then the age gate is
        confirmed.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the hidden form fields below.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # GALX and dsh are hidden inputs on the login form that must be
        # echoed back with the credentials.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
546
547     def _extract_id(self, url):
548         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
549         if mobj is None:
550             raise ExtractorError(u'Invalid URL: %s' % url)
551         video_id = mobj.group(2)
552         return video_id
553
554     def _real_extract(self, url):
555         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
556         mobj = re.search(self._NEXT_URL_RE, url)
557         if mobj:
558             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
559         video_id = self._extract_id(url)
560
561         # Get video webpage
562         self.report_video_webpage_download(video_id)
563         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
564         request = compat_urllib_request.Request(url)
565         try:
566             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
568             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
569
570         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
571
572         # Attempt to extract SWF player URL
573         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
574         if mobj is not None:
575             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
576         else:
577             player_url = None
578
579         # Get video info
580         self.report_video_info_webpage_download(video_id)
581         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
582             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
583                     % (video_id, el_type))
584             video_info_webpage = self._download_webpage(video_info_url, video_id,
585                                     note=False,
586                                     errnote='unable to download video info webpage')
587             video_info = compat_parse_qs(video_info_webpage)
588             if 'token' in video_info:
589                 break
590         if 'token' not in video_info:
591             if 'reason' in video_info:
592                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
593             else:
594                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
595
596         # Check for "rental" videos
597         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
598             raise ExtractorError(u'"rental" videos not supported')
599
600         # Start extracting information
601         self.report_information_extraction(video_id)
602
603         # uploader
604         if 'author' not in video_info:
605             raise ExtractorError(u'Unable to extract uploader name')
606         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
607
608         # uploader_id
609         video_uploader_id = None
610         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
611         if mobj is not None:
612             video_uploader_id = mobj.group(1)
613         else:
614             self._downloader.report_warning(u'unable to extract uploader nickname')
615
616         # title
617         if 'title' not in video_info:
618             raise ExtractorError(u'Unable to extract video title')
619         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
620
621         # thumbnail image
622         if 'thumbnail_url' not in video_info:
623             self._downloader.report_warning(u'unable to extract video thumbnail')
624             video_thumbnail = ''
625         else:   # don't panic if we can't find it
626             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
627
628         # upload date
629         upload_date = None
630         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
631         if mobj is not None:
632             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
633             upload_date = unified_strdate(upload_date)
634
635         # description
636         video_description = get_element_by_id("eow-description", video_webpage)
637         if video_description:
638             video_description = clean_html(video_description)
639         else:
640             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
641             if fd_mobj:
642                 video_description = unescapeHTML(fd_mobj.group(1))
643             else:
644                 video_description = u''
645
646         # subtitles
647         video_subtitles = None
648
649         if self._downloader.params.get('writesubtitles', False):
650             video_subtitles = self._extract_subtitle(video_id)
651             if video_subtitles:
652                 (sub_error, sub_lang, sub) = video_subtitles[0]
653                 if sub_error:
654                     # We try with the automatic captions
655                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
656                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
657                     if sub is not None:
658                         pass
659                     else:
660                         # We report the original error
661                         self._downloader.report_error(sub_error)
662
663         if self._downloader.params.get('allsubtitles', False):
664             video_subtitles = self._extract_all_subtitles(video_id)
665             for video_subtitle in video_subtitles:
666                 (sub_error, sub_lang, sub) = video_subtitle
667                 if sub_error:
668                     self._downloader.report_error(sub_error)
669
670         if self._downloader.params.get('listsubtitles', False):
671             sub_lang_list = self._list_available_subtitles(video_id)
672             return
673
674         if 'length_seconds' not in video_info:
675             self._downloader.report_warning(u'unable to extract video duration')
676             video_duration = ''
677         else:
678             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
679
680         # token
681         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
682
683         # Decide which formats to download
684         req_format = self._downloader.params.get('format', None)
685
686         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
687             self.report_rtmp_download()
688             video_url_list = [(None, video_info['conn'][0])]
689         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
690             url_map = {}
691             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
692                 url_data = compat_parse_qs(url_data_str)
693                 if 'itag' in url_data and 'url' in url_data:
694                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
695                     if not 'ratebypass' in url: url += '&ratebypass=yes'
696                     url_map[url_data['itag'][0]] = url
697
698             format_limit = self._downloader.params.get('format_limit', None)
699             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
700             if format_limit is not None and format_limit in available_formats:
701                 format_list = available_formats[available_formats.index(format_limit):]
702             else:
703                 format_list = available_formats
704             existing_formats = [x for x in format_list if x in url_map]
705             if len(existing_formats) == 0:
706                 raise ExtractorError(u'no known formats available for video')
707             if self._downloader.params.get('listformats', None):
708                 self._print_formats(existing_formats)
709                 return
710             if req_format is None or req_format == 'best':
711                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
712             elif req_format == 'worst':
713                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
714             elif req_format in ('-1', 'all'):
715                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
716             else:
717                 # Specific formats. We pick the first in a slash-delimeted sequence.
718                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
719                 req_formats = req_format.split('/')
720                 video_url_list = None
721                 for rf in req_formats:
722                     if rf in url_map:
723                         video_url_list = [(rf, url_map[rf])]
724                         break
725                 if video_url_list is None:
726                     raise ExtractorError(u'requested format not available')
727         else:
728             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
729
730         results = []
731         for format_param, video_real_url in video_url_list:
732             # Extension
733             video_extension = self._video_extensions.get(format_param, 'flv')
734
735             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
736                                               self._video_dimensions.get(format_param, '???'))
737
738             results.append({
739                 'id':       video_id,
740                 'url':      video_real_url,
741                 'uploader': video_uploader,
742                 'uploader_id': video_uploader_id,
743                 'upload_date':  upload_date,
744                 'title':    video_title,
745                 'ext':      video_extension,
746                 'format':   video_format,
747                 'thumbnail':    video_thumbnail,
748                 'description':  video_description,
749                 'player_url':   player_url,
750                 'subtitles':    video_subtitles,
751                 'duration':     video_duration
752             })
753         return results
754
755
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Metacafe gates some content behind a "family filter"; the extractor
    first visits the disclaimer page and confirms age in
    _real_initialize so that subsequent watch-page requests succeed.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Page that must be visited before the filter form can be submitted.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint the age-confirmation form is POSTed to.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age so the family
        filter does not block later extraction requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            # Response body is unused; the request only sets server-side state.
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe
        watch page.

        Returns a one-element list with the info dictionary, or
        delegates to the Youtube extractor for "yt-" prefixed ids.
        Raises ExtractorError when the URL or page cannot be parsed.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Extension is taken from the URL's last three characters
            # ("flv", "mp4", ...).
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars form value for the JSON-ish
            # mediaData blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # Unescape JSON-style slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings; under Python 3 str has no decode() — confirm the
        # targeted interpreter before touching them.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
851
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best available video URL plus metadata for a
        Dailymotion video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The path segment may carry a "_title" slug and a query string;
        # the bare id is everything before either of them.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe the quality keys from best to worst and keep the first
        # one present in flashvars.
        max_quality = None
        for candidate in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if candidate in flashvars:
                max_quality = candidate
                self.to_screen(u'Using %s' % candidate)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        # Unescape the JSON-style slashes in the URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the regular owner span first, then the official
        # author span; warn (but continue) if neither matches.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is not None:
            video_uploader = mobj.group(1)
        else:
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is not None:
                video_uploader = mobj_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
926
927
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the media URL and metadata for a Photobucket video.

        Prefers the JSON descriptor embedded in the page javascript and
        falls back to scraping the <link rel="video_src"> tag.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Preferred path: the javascript media descriptor.
        json_mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if json_mobj is not None:
            info = json.loads(json_mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # Fallback path: scrape other parts of the webpage.
        link_mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if link_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(link_mobj.group(1))

        title_mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_mobj.group(1).decode('utf-8')
        video_uploader = title_mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
989
990
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com page.

        Newer pages embed a long content id in a YUI namespace; when it
        is present the YQL JSON API is used, otherwise the legacy MRSS
        endpoints are scraped.  Raises ExtractorError on parse failure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: scrape the MRSS info endpoint.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize the MM/DD/YYYY publish date to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: check for a match *before* dereferencing it; the
            # previous code called m_rest.group(...) first, so a
            # non-matching page raised AttributeError instead of the
            # intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1058
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the video URL and metadata for a vimeo.com video.

        Parses the player config JSON embedded in the page, then builds
        the play_redirect URL from the request signature/timestamp and
        the best available codec+quality pair.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: the previous bare `except:` swallowed every exception,
        # including KeyboardInterrupt/SystemExit.  Only the failures the
        # guarded code can produce are caught now: IndexError when the
        # config marker is absent and ValueError when the JSON is bad.
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1160
1161
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles videos.arte.tv pages in French and German.  "Plus 7"
    catch-up videos are fully extracted; live-stream URLs (matching
    _LIVE_URL) are detected but extraction for them returns nothing.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body.

        Raises ExtractorError on network failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and match *regex* (with *regexFlags*) against it.

        matchTuples is a list of (group_index, key, error_message)
        triples; each named group is stored in the returned dict under
        *key*, and ExtractorError(error_message) is raised when the
        group did not participate in the match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP path/player for a live stream page.

        NOTE(review): the final video_url is computed but never
        returned, and _real_extract discards this call's result — live
        streams are effectively unsupported; confirm before relying on
        this method.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of XML descriptors for a "Plus 7" video and
        return its info dictionary (HD quality URL)."""
        video_lang = url.split('/')[-3]
        # Step 1: the page points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> reference.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id, title, date and the HD URL from the final
        # video descriptor.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # NOTE(review): .decode assumes a Python 2 byte string here.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus 7 extraction path.

        Returns a one-element list for Plus 7 videos; returns None for
        live streams (see extractLiveStream).
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1281
1282
1283 class GenericIE(InfoExtractor):
1284     """Generic last-resort information extractor."""
1285
1286     _VALID_URL = r'.*'
1287     IE_NAME = u'generic'
1288
1289     def report_download_webpage(self, video_id):
1290         """Report webpage download."""
1291         if not self._downloader.params.get('test', False):
1292             self._downloader.report_warning(u'Falling back on generic information extractor.')
1293         super(GenericIE, self).report_download_webpage(video_id)
1294
1295     def report_following_redirect(self, new_url):
1296         """Report information extraction."""
1297         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1298
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects.  Returns the final URL if it differs from *url*,
        False otherwise.  Raises ExtractorError for unhandled protocols.
        """
        # Request subclass that forces the HEAD method so only headers
        # are transferred.
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location.
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers; a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        # OpenerDirector.open returns None when no handler accepts the
        # URL's scheme (e.g. non-HTTP protocols).
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             raise ExtractorError(u'Invalid URL: %s' % url)
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             # Try to find twitter cards info
1378             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1379         if mobj is None:
1380             raise ExtractorError(u'Invalid URL: %s' % url)
1381
1382         # It's possible that one of the regexes
1383         # matched, but returned an empty group:
1384         if mobj.group(1) is None:
1385             raise ExtractorError(u'Invalid URL: %s' % url)
1386
1387         video_url = compat_urllib_parse.unquote(mobj.group(1))
1388         video_id = os.path.basename(video_url)
1389
1390         # here's a fun little line of code for you:
1391         video_extension = os.path.splitext(video_id)[1][1:]
1392         video_id = os.path.splitext(video_id)[0]
1393
1394         # it's tempting to parse this further, but you would
1395         # have to take into account all the variations like
1396         #   Video Title - Site Name
1397         #   Site Name | Video Title
1398         #   Video Title - Tagline | Site Name
1399         # and so on and so forth; it's just not practical
1400         mobj = re.search(r'<title>(.*)</title>', webpage)
1401         if mobj is None:
1402             raise ExtractorError(u'Unable to extract title')
1403         video_title = mobj.group(1)
1404
1405         # video uploader is domain name
1406         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1407         if mobj is None:
1408             raise ExtractorError(u'Unable to extract title')
1409         video_uploader = mobj.group(1)
1410
1411         return [{
1412             'id':       video_id,
1413             'url':      video_url,
1414             'uploader': video_uploader,
1415             'upload_date':  None,
1416             'title':    video_title,
1417             'ext':      video_extension,
1418         }]
1419
1420
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData API (50 results per request) until n ids are
        collected or the API reports fewer total items than requested.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # idiom fix: 'x not in d' instead of 'not x in d'
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never request more pages than the API says exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # renamed loop variable: 'id' shadowed the builtin
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1462
1463
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # Accumulate result URLs into a playlist-shaped dict.
        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for match in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                playlist['entries'].append({
                    '_type': 'url',
                    'url': match.group(1)
                })

            # Stop once enough pages were fetched or no "next" link remains.
            no_more_pages = not re.search(self._MORE_PAGES_INDICATOR, webpage)
            if (pagenum * 10 > n) or no_more_pages:
                return playlist
1494
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo JSON search endpoint (30 results per page)
        until n entries are collected or the API signals the last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # Robustness fixes: previously `i` stayed unbound (NameError
            # below) when `results` was empty, and a result that did not
            # match the URL regex crashed with AttributeError on None.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # skip results without a recognizable video URL
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1528
1529
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is mandatory.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch every page of the playlist feed and return a playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id may come from either alternative of _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, video url) pairs; order is restored afterwards.
        collected = []
        pagenum = 1
        while True:
            start_index = self._MAX_RESULTS * (pagenum - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % pagenum)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in feed['entry']:
                if 'content' in entry:
                    collected.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(feed['entry']) < self._MAX_RESULTS:
                break
            pagenum += 1

        ordered_urls = [pair[1] for pair in sorted(collected)]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1595
1596
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in a channel page, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Return a playlist of all videos in a channel."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(page))

        # Subsequent pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1654
1655
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect all upload ids of a user via the paged GData API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps the result size per query (currently 50), so
        # keep requesting pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers, deduplicating within the page.
            page_ids = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A page with fewer than _GDATA_PAGE_SIZE ids must be the last
            # one, so there is no need to query again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1712
1713
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of all videos posted by a blip.tv user."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # Robustness fix: a missing attribute previously crashed with
        # AttributeError on None; raise a clear extractor error instead.
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers (deduplicated within the page)
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A page with fewer than _PAGE_SIZE ids must be the last one -
            # there are no more ids on further pages, no need to query again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break
            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1772
1773
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles link.

        NOTE(review): this method reads the raw response bytes and calls
        .decode() on regex groups, which assumes Python 2 str semantics -
        confirm before running under Python 3.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates pressing the button)
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            # The site embeds restriction notices in a <strong> block.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Output extension is taken from the download URL's path.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1821
1822
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry.  Failures only emit warnings; the
        extractor still proceeds unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available - continue without logging in.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # Fixed typo in user-facing message: "exceded" -> "exceeded"
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail of a
        Facebook video page.  Prefers the HD source when available."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters live between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1919
1920
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        api.swf and /play/ URLs are first resolved to canonical video pages,
        then the site's JSON API (skin=json) is queried.  If the server
        responds with the video bytes directly (Content-Type video/*), a
        minimal info dict carrying the open urlhandle is returned instead.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # Rewrite api.swf fragment URLs to /play/ style first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id is in the fragment of
            # the redirect target, so recurse with the canonical page URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the JSON API parameters, respecting any existing query.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves machine-readable output to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode on a str assumes Python 2 semantics;
                # confirm before running under Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh  # downloader reuses the open handle
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above and is still open here.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2018
2019
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Apply the RC4 stream cipher to *data* using *key*.

        RC4 is symmetric, so the same routine encrypts and decrypts.  Input
        bytes are read through compat_ord so it works on both Python 2 and 3;
        the result is accumulated as a native str via chr().
        """
        # key-scheduling algorithm (KSA)
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # pseudo-random generation algorithm (PRGA), XORed over the data
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* (bytes) as a bytes object."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Extract the video URL, trying the plain <source> tag first and
        falling back to decrypting the RC4-encrypted player XML ("encxml")."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # doubly base64-encoded site constant; part of the RC4 key below
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page exposes a direct <source> URL.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            # extension is validated here even though 'flv' is returned below
            mobj = re.search('[.](.+?)$', video_url)
            if mobj is None:
                raise ExtractorError(u'Unable to extract extention')
            video_ext = mobj.group(1)

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # Hard case: locate the encrypted player-XML ("encxml") endpoint in
        # the flashvars and rebuild its query string.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data; RC4 key = md5(b64dec(b64dec(GK)) + md5(video_id))
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos (empty connectionurl means plain HTTP)
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_file = compat_urllib_parse.unquote(mobj.group(1))

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # BUGFIX: video_filepath was referenced below without ever being
            # assigned, so every f4m stream died with a NameError.  Extract it
            # from the decrypted data the same way as the other fields.
            mobj = re.search('filepath=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract filepath')
            video_filepath = compat_urllib_parse.unquote(mobj.group(1))
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':                 video_id,
            'url':                video_rtmpurl,
            'tc_url':             video_rtmpurl,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2179
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # bitrate identifiers, lowest to highest
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # container extension per bitrate (all mp4)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # display resolution per bitrate, for --list-formats output
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # overridden because _VALID_URL needs the re.VERBOSE flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one 'bitrate : ext [dimensions]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Resolve the URL to an episode, fetch the MRSS index and per-part
        configuration, and return one info dict per episode part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ':tds'-style shortcuts are rewritten to the show's full-episodes
        # page and re-matched so the named groups are populated
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # a bare full-episodes URL means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # the site redirects to the newest episode; recover its title
            # from the final URL after following the redirect
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid-style URIs embedded in the player <param> or a JS variable
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a media tag without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        # the MRSS index lists one <item> per episode part
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like e.g. 'mgid:cms:video:thedailyshow.com:12345'
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # each <rendition> pairs a bitrate with an RTMP source URL
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # rewrite the RTMP source into a plain-HTTP mirror URL by reusing
            # the 'gsp.comedystor/...' path component
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2346
2347
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video URL from an Escapist page via the player's
        JSON-ish configuration file.

        Raises ExtractorError when the page or configuration cannot be
        parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # ROBUSTNESS: every re.search result below used to be dereferenced
        # without a None check, so any page-layout change crashed with
        # AttributeError instead of a clean extraction error.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON: the config uses single
        # quotes, which json.loads rejects, so normalize them first.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # entry 1 of the playlist is the episode itself (entry 0 appears to
        # be a lead-in -- NOTE(review): confirm against a live config)
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2401
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # extractor currently marked broken; kept for reference
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Fetch the moogaloop metadata XML, then the f4m manifest, and
        reassemble the fragment URL for the stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # info dict is filled incrementally below
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # metadata XML carries title/description/thumbnail and the
        # manifest location in <video><file>
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore query parameter -- presumably required by the HDS server;
        # NOTE(review): value matches the Flash HDS client version of the era
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # f4m manifest (Adobe HDS): pull the media url attribute and the
        # manifest-level id, then rebuild the first-fragment URL
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # '/z' + id-minus-last-2-chars + '/' + media url + first segment name
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2463
2464
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flashvars of an xvideos page for URL, title and
        thumbnail, and return a single info dict."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # video URL: percent-encoded in the flv_url flashvar
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # title: taken from the <title> tag, minus the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # thumbnail: the entire matched URL (group 0) is the image address
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2514
2515
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       Resolves the permalink to a track id through the public resolve.json
       API, then asks the stream-definitions endpoint for the MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # uploader and track slug both come straight from the URL
        uploader = match.group(1)
        slug_title = match.group(2)
        # simple display title: uploader + slug (kept for parity)
        simple_title = u'%s-%s' % (uploader, slug_title)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        permalink = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = ('http://api.soundcloud.com/resolve.json?url=%s'
                      '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' % permalink)
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = ('https://api.sndcdn.com/i1/tracks/%s/streams'
                       '?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' % str(video_id))
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        media_url = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2572
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set permalink via the API and return one info dict per
        track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        # NOTE(review): simple_title is computed but never used below
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # resolve.json maps the permalink to the set's API object
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        # API-level errors: report each one and bail out (returns None
        # rather than raising, by design of this error path)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        # one streams lookup per track in the set
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2635
2636
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 'jsclassref' flashvar into an RTMP path and
        return a single info dict for the talk."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64-encoded,
        # percent-encoded RTMP path fragment
        ref_match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if ref_match is None:
            raise ExtractorError(u'Unable to extract video url')
        encoded_ref = ref_match.group(1).encode('ascii')
        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_ref).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/%s' % real_id

        # Extract title from the page's JS contentTitle variable
        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # Extract description, falling back to a placeholder
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = desc_match.group(1) if desc_match is not None else u'No description available.'

        # derive id and extension from the RTMP path's file name
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2683
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name -> {bitrate: url_list}, or directly to a
        url_list when no bitrate information exists (detected via TypeError).
        'best' or an unknown bitrate falls back to the highest available.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, probing each with a GET;
        returns None when no mirror responds."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass

        return None

    def _print_formats(self, formats):
        """Print one 'format<TAB>bitrate<TAB>[ext]' line per available
        format for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON, pick a working stream URL for the
        requested (or best) format, and return a single info dict."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: re.match groups are already text (str) on Python 3, so the
        # old .decode('utf-8') calls here and in the return dict raised
        # AttributeError there.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # ROBUSTNESS: ensure format_param is bound even when formats is empty
        format_param = None
        if req_format is None or req_format == 'best':
            # probe each format until one mirror answers
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # ROBUSTNESS: check_urls returns None when no mirror responds; the
        # old code then crashed on file_url.split() below.
        if file_url is None:
            raise ExtractorError(u'Unable to extract media URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2788
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL type: a single video (course + video params),
        a course page, or the root page. Course and root pages return the
        accumulated results of recursively extracting each linked page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            # ElementTree.fromstring accepts bytes, so the raw response is fine here.
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Bug fix: this page was fetched with urlopen().read(), which
            # returns bytes on Python 3 and makes the str-pattern re.findall
            # below raise TypeError. _download_webpage returns a decoded
            # string (and handles reporting/errors), matching the
            # course-page branch above.
            rootpage = self._download_webpage(rootURL, info['id'],
                                              errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2889
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Scrape song/performer metadata from the video page, then fetch the
        mediaGen XML and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Bug fix: _download_webpage returns an already-decoded unicode
        # string, so the former .decode('iso-8859-1') calls were wrong —
        # they raise AttributeError on Python 3 (str has no decode) and
        # crash on any non-ASCII content under Python 2 (implicit ASCII
        # encode before decoding).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: it previously read 'Unable to mtvn_uri'.
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        # ElementTree.fromstring accepts the raw bytes response directly.
        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2958
2959
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: millisecond timestamp followed by two random numbers.
        now_ms = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle Youku's alphabet with a linear
        # congruential generator seeded from the playlist JSON.
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        while alphabet:
            seed = (seed * 211 + 30031) % 65536
            pos = int(math.floor(seed / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        # Every '*'-separated token of fileId is an index into the
        # seed-shuffled alphabet; empty tokens are skipped.
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                # Any other explicit format request falls back to flv.
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters at positions 8 and 9 of the decoded fileid carry the
        # segment number, so they are rewritten per segment below.
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3052
3053
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        def _extract(pattern, what):
            # First capture group of pattern, or a fatal error naming
            # the missing piece of information.
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(u'Unable to extract %s' % what)
            return match.group(1)

        video_url = compat_urllib_parse.unquote(_extract(self.VIDEO_URL_RE, 'video url'))
        video_title = _extract(self.VIDEO_TITLE_RE, 'video title')
        video_thumbnail = _extract(self.VIDEO_THUMB_RE, 'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3097
3098
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post entry page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date extracted from the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader extracted from the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title extracted from the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction of the video page."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Scrape a Google+ post page for metadata, follow the embedded
        photo/video page, and pick the highest-resolution video link."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; stays None if the timestamp anchor
        # is not present on the page)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            # NOTE(review): assumes the page timestamp is formatted
            # YYYY-MM-DD — strptime raises ValueError otherwise.
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        # NOTE(review): tuples are (resolution-string, url) and sort
        # lexicographically — correct only while resolutions have equal
        # digit counts.
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3208
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media file URL is derived from the page path, not scraped.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, HTML-unescaped,
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: this key was misspelled 'uploader_date', which is not
            # a recognized field, so the date was silently discarded.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3243
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of clips requested per archive-API page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one archive-API page (JSON) and build an info dict for
        each clip that has a video_file_url.

        Returns (total clips in response, list of info dicts); the count
        includes clips skipped for having no file URL, so callers can use
        it to detect the last page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API reports errors as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD…'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Handle the three URL shapes: a whole channel (paged archive),
        a single broadcast (/b/), or a chapter of a broadcast (/c/)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> matching the id scraped above; the loop
            # variable 'a' is deliberately used after the loop (for/else).
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted; single broadcasts
            # are never paged.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3376
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The media URL lives in the second <source> tag of the player.
        source_m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_m:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_m.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not title_m:
            title_m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not title_m:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_m.group('title'))

        desc_m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_m.group('desc')) if desc_m else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3414
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Bypass the age gate by requesting the video page with a fixed
        # birth date in the query string.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # The movie blocks, title spans and thumbnails appear in the same
        # order on the page, so zip the three match iterators together.
        videos = []
        for movie, name, thumb in zip(re.finditer(movie_re, webpage),
                                      re.finditer(name_re, webpage),
                                      re.finditer(thumb_re, webpage)):
            vid = movie.group('videoID')
            vurl = movie.group('videoURL')
            if not vurl:
                raise ExtractorError(u'Cannot find video url for %s' % vid)
            videos.append({
                'id': vid,
                'url': vurl,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3459
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The media file is served from the CDN by video id alone.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # Each .group() call below raises AttributeError when its
            # pattern did not match; all handled collectively.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                   webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_m.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3489
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        # The Flash player receives the media URL via addVariable("file", ...).
        src_m = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_m is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_m.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_m = re.search(r"<title>(.*)</title>", webpage_src)
        if title_m is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_m.group(1)

        thumb_m = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if thumb_m is not None:
            thumbnail = thumb_m.group(1)
        else:
            thumbnail = None
            candy_m = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_m is not None:
                title = candy_m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3538
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3573
3574
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # An age_verified cookie is enough to get past the age gate
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional metadata: warn, don't fail)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional metadata: warn, don't fail)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The directory component encodes resolution and bitrate, e.g. 480p_370k
            size, bitrate = path.split('/')[4].split('_')[:2]
            format = size + '-' + bitrate
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            # BUG FIX: this previously tested `result` (a stale, guaranteed
            # non-None match object), so an unavailable format silently
            # returned [None] instead of raising.
            format = self._specific(req_format, formats)
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3689
3690
3691
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the old message wrongly said "Unable to extract video title"
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3730
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual media url lives on a separate embed page
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3775
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # Each play session is identified by a random token
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        # The API only hands out one track at a time; keep asking for the
        # next one until it reports the final track of the mix
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3819
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        # Both media and thumbnail urls follow directly from the video id
        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader
        }]
3843
class TEDIE(InfoExtractor):
    """Information extractor for ted.com single talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so re.VERBOSE must be
        # passed explicitly everywhere it is matched.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on url type: a single talk vs. a playlist of talks
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # One match per <li> talk entry.
        # NOTE(review): the `([.\s]*?)` groups only match dots/whitespace;
        # they look like they were meant to be [\S\s] — confirm against the
        # actual page markup before changing.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk url is delegated back to this IE via url_result
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The inline talkDetails javascript object carries the numeric id
        # and the media slug used to build the download url
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3922
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de; metadata comes from an XML API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`
            # (NameError); fall back to the file extension instead
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3976
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="module-title">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document describes the available renditions
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last listed rendition (matches the original behavior;
        # presumably the best quality — confirm against the XML ordering)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4009
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # The og:title carries a site-name prefix that we strip off
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4056
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4095
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUG FIX: a list comprehension is never None, so the old
        # `streams is None` check could not fire; test emptiness instead
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUG FIX: stream_ must be initialized up front, otherwise the code
        # raised UnboundLocalError (not the intended ExtractorError) when
        # neither loop found a matching stream
        stream_ = None
        for s in streams:        #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:        #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The linked asx/metafile page contains the real mms/rtsp url
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
4153
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Normalize to the canonical post url before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, webpage)
        if video_match is None:
            self.to_screen("No video found")
            return []
        video_url = video_match.group('video_url')
        ext = video_match.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title_match = re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL)
        title = unescapeHTML(title_match.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4187
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')
        download_link = m_download.group(1)

        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = json.loads(re.search(r'items: (.*?),$',
                                    download_webpage, re.MULTILINE).group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': info[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': info[u'thumb_url'],
                 'uploader': info[u'artist']
                 }]
4233
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct media url is carried by the mp4 <source> tag
        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4264         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # The mrss feed for the notice exposes both the media url and the title
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      url_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4293
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        url_match = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = url_match.group(1)

        # og:title may be quoted with either " or '
        title_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        # A missing description is only a warning, not a hard failure
        desc_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1) or desc_match.group(2)
        else:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None

        thumb_match = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumb_match.group(1),
        }]
4337
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        def first_group(pattern, error, flags=0):
            # Search the page and fail with the given message when absent
            found = re.search(pattern, webpage, flags)
            if found is None:
                raise ExtractorError(error)
            return found.group(1)

        video_url = first_group(r'<meta property="twitter:player:stream" content="(.+?)"',
                                u'Unable to extract video URL')
        video_title = first_group(r'<meta property="og:title" content="(.+?)"',
                                  u'Unable to extract title')
        thumbnail = first_group(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
                                u'Unable to extract thumbnail')
        uploader = first_group(r'<div class="user">.*?<h2>(.+?)</h2>',
                               u'Unable to extract uploader', re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4380
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)

        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret is needed by both video metadata endpoints below
        secret_match = re.search(r"photo_secret: '(\w+)'", webpage)
        if secret_match is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = secret_match.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_match = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if node_match is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = node_match.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream_match = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream_match.group(1) + unescapeHTML(stream_match.group(2))

        title_match = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        # A missing description only warns; everything else is required
        desc_match = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1) or desc_match.group(2)
        else:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None

        thumb_match = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1) or thumb_match.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4442
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id lives in the article tag of the page
        video_id = re.search(r'<article class="video" data-id="(\d+?)"', webpage).group(1)

        self.report_extraction(video_id)

        title_match = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        thumb_match = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')

        desc_match = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if desc_match is None:
            raise ExtractorError(u'Unable to extract description')

        # The actual media url comes from a separate cvp xml document
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        file_match = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if file_match is None:
            raise ExtractorError(u'Unable to extract video url')

        return [{
            'id':          video_id,
            'url':         file_match.group(1),
            'ext':         'mp4',
            'title':       title_match.group(1),
            'thumbnail':   thumb_match.group(1),
            'description': desc_match.group(1),
        }]
4488         
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media_match = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty server means the file field already holds the full url
        if len(media_match.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(media_match.group('file'))
        else:
            video_url = media_match.group('server') + '/key=' + media_match.group('file')
        video_extension = video_url.split('.')[-1]

        title_match = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Description and uploader are optional; date and thumbnail are not
        desc_match = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
        video_description = unescapeHTML(desc_match.group('description')) if desc_match is not None else u''

        date_match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_match is None:
            raise ExtractorError(u'Unable to extract upload date')
        video_upload_date = ''.join(date_match.group('upload_date_Y', 'upload_date_m', 'upload_date_d'))

        uploader_match = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
        video_uploader_id = uploader_match.group('uploader_id') if uploader_match is not None else u'anonymous'

        thumb_match = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail URL')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': thumb_match.group('thumbnail')
        }]
4545
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # The page wants an ax/ts query string and hands back a cookie
        # that the serve endpoint requires later on
        complete_url = url + "?" + compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)
        tracks_match = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
        if tracks_match is None:
            raise ExtractorError(u'Unable to extrack tracks')
        html_tracks = tracks_match.group(1).strip()
        try:
            # Only the first track of the embedded playlist is used
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id':       track_id,
            'url':      song_data[u"url"],
            'ext':      "mp3",
            'title':    track[u"song"],
            'artist':   track[u"artist"],
        }]
4596
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page redirects via a javascript location assignment
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        redirect_target = re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
        webpage = self._download_webpage(urlh.geturl() + redirect_target, video_id, u'Downloading redirect page')

        title_match = re.search(r'<title>(.*)</title>', webpage)
        title = title_match.group(1).split('/')[0].strip()

        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is two &-joined key=value fields: media url, then thumbnail
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
4631
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GenericIE must remain last: it is the catch-all fallback.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4699
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level
    return globals()['%sIE' % ie_name]