_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194 class SearchInfoExtractor(InfoExtractor):
 195     """
 196     Base class for paged search queries extractors.
 197     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 198     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 199     """
 200
 201     @classmethod
 202     def _make_valid_url(cls):
 203         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 204
 205     @classmethod
 206     def suitable(cls, url):
 207         return re.match(cls._make_valid_url(), url) is not None
 208
 209     def _real_extract(self, query):
 210         mobj = re.match(self._make_valid_url(), query)
 211         if mobj is None:
 212             raise ExtractorError(u'Invalid search query "%s"' % query)
 213
 214         prefix = mobj.group('prefix')
 215         query = mobj.group('query')
 216         if prefix == '':
 217             return self._get_n_results(query, 1)
 218         elif prefix == 'all':
 219             return self._get_n_results(query, self._MAX_RESULTS)
 220         else:
 221             n = int(prefix)
 222             if n <= 0:
 223                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 224             elif n > self._MAX_RESULTS:
 225                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 226                 n = self._MAX_RESULTS
 227             return self._get_n_results(query, n)
 228
 229     def _get_n_results(self, query, n):
 230         """Get a specified number of results for a query"""
 231         raise NotImplementedError("This method must be implemented by sublclasses")
 232
 233
 234 class YoutubeIE(InfoExtractor):
 235     """Information extractor for youtube.com."""
 236
 237     _VALID_URL = r"""^
 238                      (
 239                          (?:https?://)?                                       # http(s):// (optional)
 240                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 241                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 242                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 243                          (?:                                                  # the various things that can precede the ID:
 244                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 245                              |(?:                                             # or the v= param in all its forms
 246                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 247                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 248                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 249                                  v=
 250                              )
 251                          )?                                                   # optional -> youtube.com/xxxx is OK
 252                      )?                                                       # all until now is optional -> you can pass the naked ID
 253                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 254                      (?(1).+)?                                                # if we found the ID, everything can follow
 255                      $"""
 256     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 257     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 258     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 259     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 260     _NETRC_MACHINE = 'youtube'
 261     # Listed in order of quality
 262     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 263     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 264     _video_extensions = {
 265         '13': '3gp',
 266         '17': 'mp4',
 267         '18': 'mp4',
 268         '22': 'mp4',
 269         '37': 'mp4',
 270         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 271         '43': 'webm',
 272         '44': 'webm',
 273         '45': 'webm',
 274         '46': 'webm',
 275     }
 276     _video_dimensions = {
 277         '5': '240x400',
 278         '6': '???',
 279         '13': '???',
 280         '17': '144x176',
 281         '18': '360x640',
 282         '22': '720x1280',
 283         '34': '360x640',
 284         '35': '480x854',
 285         '37': '1080x1920',
 286         '38': '3072x4096',
 287         '43': '360x640',
 288         '44': '480x854',
 289         '45': '720x1280',
 290         '46': '1080x1920',
 291     }
 292     IE_NAME = u'youtube'
 293
 294     @classmethod
 295     def suitable(cls, url):
 296         """Receives a URL and returns True if suitable for this IE."""
 297         if YoutubePlaylistIE.suitable(url): return False
 298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 299
 300     def report_lang(self):
 301         """Report attempt to set language."""
 302         self.to_screen(u'Setting language')
 303
 304     def report_login(self):
 305         """Report attempt to log in."""
 306         self.to_screen(u'Logging in')
 307
 308     def report_video_webpage_download(self, video_id):
 309         """Report attempt to download video webpage."""
 310         self.to_screen(u'%s: Downloading video webpage' % video_id)
 311
 312     def report_video_info_webpage_download(self, video_id):
 313         """Report attempt to download video info webpage."""
 314         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 315
 316     def report_video_subtitles_download(self, video_id):
 317         """Report attempt to download video info webpage."""
 318         self.to_screen(u'%s: Checking available subtitles' % video_id)
 319
 320     def report_video_subtitles_request(self, video_id, sub_lang, format):
 321         """Report attempt to download video info webpage."""
 322         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 323
 324     def report_video_subtitles_available(self, video_id, sub_lang_list):
 325         """Report available subtitles."""
 326         sub_lang = ",".join(list(sub_lang_list.keys()))
 327         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 328
 329     def report_information_extraction(self, video_id):
 330         """Report attempt to extract video information."""
 331         self.to_screen(u'%s: Extracting video information' % video_id)
 332
 333     def report_unavailable_format(self, video_id, format):
 334         """Report extracted video URL."""
 335         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 336
 337     def report_rtmp_download(self):
 338         """Indicate the download will use the RTMP protocol."""
 339         self.to_screen(u'RTMP download detected')
 340
 341     def _get_available_subtitles(self, video_id):
 342         self.report_video_subtitles_download(video_id)
 343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 344         try:
 345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 350         if not sub_lang_list:
 351             return (u'video doesn\'t have subtitles', None)
 352         return sub_lang_list
 353
 354     def _list_available_subtitles(self, video_id):
 355         sub_lang_list = self._get_available_subtitles(video_id)
 356         self.report_video_subtitles_available(video_id, sub_lang_list)
 357
 358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 359         """
 360         Return tuple:
 361         (error_message, sub_lang, sub)
 362         """
 363         self.report_video_subtitles_request(video_id, sub_lang, format)
 364         params = compat_urllib_parse.urlencode({
 365             'lang': sub_lang,
 366             'name': sub_name,
 367             'v': video_id,
 368             'fmt': format,
 369         })
 370         url = 'http://www.youtube.com/api/timedtext?' + params
 371         try:
 372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 375         if not sub:
 376             return (u'Did not fetch video subtitles', None, None)
 377         return (None, sub_lang, sub)
 378
 379     def _request_automatic_caption(self, video_id, webpage):
 380         """We need the webpage for getting the captions url, pass it as an
 381            argument to speed up the process."""
 382         sub_lang = self._downloader.params.get('subtitleslang')
 383         sub_format = self._downloader.params.get('subtitlesformat')
 384         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 385         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 386         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 387         if mobj is None:
 388             return [(err_msg, None, None)]
 389         player_config = json.loads(mobj.group(1))
 390         try:
 391             args = player_config[u'args']
 392             caption_url = args[u'ttsurl']
 393             timestamp = args[u'timestamp']
 394             params = compat_urllib_parse.urlencode({
 395                 'lang': 'en',
 396                 'tlang': sub_lang,
 397                 'fmt': sub_format,
 398                 'ts': timestamp,
 399                 'kind': 'asr',
 400             })
 401             subtitles_url = caption_url + '&' + params
 402             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 403             return [(None, sub_lang, sub)]
 404         except KeyError:
 405             return [(err_msg, None, None)]
 406
 407     def _extract_subtitle(self, video_id):
 408         """
 409         Return a list with a tuple:
 410         [(error_message, sub_lang, sub)]
 411         """
 412         sub_lang_list = self._get_available_subtitles(video_id)
 413         sub_format = self._downloader.params.get('subtitlesformat')
 414         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 415             return [(sub_lang_list[0], None, None)]
 416         if self._downloader.params.get('subtitleslang', False):
 417             sub_lang = self._downloader.params.get('subtitleslang')
 418         elif 'en' in sub_lang_list:
 419             sub_lang = 'en'
 420         else:
 421             sub_lang = list(sub_lang_list.keys())[0]
 422         if not sub_lang in sub_lang_list:
 423             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 424
 425         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 426         return [subtitle]
 427
 428     def _extract_all_subtitles(self, video_id):
 429         sub_lang_list = self._get_available_subtitles(video_id)
 430         sub_format = self._downloader.params.get('subtitlesformat')
 431         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 432             return [(sub_lang_list[0], None, None)]
 433         subtitles = []
 434         for sub_lang in sub_lang_list:
 435             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 436             subtitles.append(subtitle)
 437         return subtitles
 438
 439     def _print_formats(self, formats):
 440         print('Available formats:')
 441         for x in formats:
 442             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 443
 444     def _real_initialize(self):
 445         if self._downloader is None:
 446             return
 447
 448         username = None
 449         password = None
 450         downloader_params = self._downloader.params
 451
 452         # Attempt to use provided username and password or .netrc data
 453         if downloader_params.get('username', None) is not None:
 454             username = downloader_params['username']
 455             password = downloader_params['password']
 456         elif downloader_params.get('usenetrc', False):
 457             try:
 458                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 459                 if info is not None:
 460                     username = info[0]
 461                     password = info[2]
 462                 else:
 463                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 464             except (IOError, netrc.NetrcParseError) as err:
 465                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 466                 return
 467
 468         # Set language
 469         request = compat_urllib_request.Request(self._LANG_URL)
 470         try:
 471             self.report_lang()
 472             compat_urllib_request.urlopen(request).read()
 473         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 474             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 475             return
 476
 477         # No authentication to be performed
 478         if username is None:
 479             return
 480
 481         request = compat_urllib_request.Request(self._LOGIN_URL)
 482         try:
 483             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 484         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 485             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 486             return
 487
 488         galx = None
 489         dsh = None
 490         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 491         if match:
 492           galx = match.group(1)
 493
 494         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 495         if match:
 496           dsh = match.group(1)
 497
 498         # Log in
 499         login_form_strs = {
 500                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 501                 u'Email': username,
 502                 u'GALX': galx,
 503                 u'Passwd': password,
 504                 u'PersistentCookie': u'yes',
 505                 u'_utf8': u'霱',
 506                 u'bgresponse': u'js_disabled',
 507                 u'checkConnection': u'',
 508                 u'checkedDomains': u'youtube',
 509                 u'dnConn': u'',
 510                 u'dsh': dsh,
 511                 u'pstMsg': u'0',
 512                 u'rmShown': u'1',
 513                 u'secTok': u'',
 514                 u'signIn': u'Sign in',
 515                 u'timeStmp': u'',
 516                 u'service': u'youtube',
 517                 u'uilel': u'3',
 518                 u'hl': u'en_US',
 519         }
 520         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 521         # chokes on unicode
 522         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 523         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 524         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 525         try:
 526             self.report_login()
 527             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 528             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 529                 self._downloader.report_warning(u'unable to log in: bad username or password')
 530                 return
 531         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 532             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 533             return
 534
 535         # Confirm age
 536         age_form = {
 537                 'next_url':     '/',
 538                 'action_confirm':   'Confirm',
 539                 }
 540         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 541         try:
 542             self.report_age_confirmation()
 543             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 544         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 545             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 546
 547     def _extract_id(self, url):
 548         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 549         if mobj is None:
 550             raise ExtractorError(u'Invalid URL: %s' % url)
 551         video_id = mobj.group(2)
 552         return video_id
 553
 554     def _real_extract(self, url):
 555         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 556         mobj = re.search(self._NEXT_URL_RE, url)
 557         if mobj:
 558             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 559         video_id = self._extract_id(url)
 560
 561         # Get video webpage
 562         self.report_video_webpage_download(video_id)
 563         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 564         request = compat_urllib_request.Request(url)
 565         try:
 566             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 568             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 569
 570         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 571
 572         # Attempt to extract SWF player URL
 573         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 574         if mobj is not None:
 575             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 576         else:
 577             player_url = None
 578
 579         # Get video info
 580         self.report_video_info_webpage_download(video_id)
 581         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 582             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 583                     % (video_id, el_type))
 584             video_info_webpage = self._download_webpage(video_info_url, video_id,
 585                                     note=False,
 586                                     errnote='unable to download video info webpage')
 587             video_info = compat_parse_qs(video_info_webpage)
 588             if 'token' in video_info:
 589                 break
 590         if 'token' not in video_info:
 591             if 'reason' in video_info:
 592                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 593             else:
 594                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 595
 596         # Check for "rental" videos
 597         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 598             raise ExtractorError(u'"rental" videos not supported')
 599
 600         # Start extracting information
 601         self.report_information_extraction(video_id)
 602
 603         # uploader
 604         if 'author' not in video_info:
 605             raise ExtractorError(u'Unable to extract uploader name')
 606         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 607
 608         # uploader_id
 609         video_uploader_id = None
 610         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 611         if mobj is not None:
 612             video_uploader_id = mobj.group(1)
 613         else:
 614             self._downloader.report_warning(u'unable to extract uploader nickname')
 615
 616         # title
 617         if 'title' not in video_info:
 618             raise ExtractorError(u'Unable to extract video title')
 619         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 620
 621         # thumbnail image
 622         if 'thumbnail_url' not in video_info:
 623             self._downloader.report_warning(u'unable to extract video thumbnail')
 624             video_thumbnail = ''
 625         else:   # don't panic if we can't find it
 626             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 627
 628         # upload date
 629         upload_date = None
 630         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 631         if mobj is not None:
 632             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 633             upload_date = unified_strdate(upload_date)
 634
 635         # description
 636         video_description = get_element_by_id("eow-description", video_webpage)
 637         if video_description:
 638             video_description = clean_html(video_description)
 639         else:
 640             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 641             if fd_mobj:
 642                 video_description = unescapeHTML(fd_mobj.group(1))
 643             else:
 644                 video_description = u''
 645
 646         # subtitles
 647         video_subtitles = None
 648
 649         if self._downloader.params.get('writesubtitles', False):
 650             video_subtitles = self._extract_subtitle(video_id)
 651             if video_subtitles:
 652                 (sub_error, sub_lang, sub) = video_subtitles[0]
 653                 if sub_error:
 654                     # We try with the automatic captions
 655                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 656                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 657                     if sub is not None:
 658                         pass
 659                     else:
 660                         # We report the original error
 661                         self._downloader.report_error(sub_error)
 662
 663         if self._downloader.params.get('allsubtitles', False):
 664             video_subtitles = self._extract_all_subtitles(video_id)
 665             for video_subtitle in video_subtitles:
 666                 (sub_error, sub_lang, sub) = video_subtitle
 667                 if sub_error:
 668                     self._downloader.report_error(sub_error)
 669
 670         if self._downloader.params.get('listsubtitles', False):
 671             sub_lang_list = self._list_available_subtitles(video_id)
 672             return
 673
 674         if 'length_seconds' not in video_info:
 675             self._downloader.report_warning(u'unable to extract video duration')
 676             video_duration = ''
 677         else:
 678             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 679
 680         # token
 681         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 682
 683         # Decide which formats to download
 684         req_format = self._downloader.params.get('format', None)
 685
 686         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 687             self.report_rtmp_download()
 688             video_url_list = [(None, video_info['conn'][0])]
 689         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 690             url_map = {}
 691             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 692                 url_data = compat_parse_qs(url_data_str)
 693                 if 'itag' in url_data and 'url' in url_data:
 694                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 695                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 696                     url_map[url_data['itag'][0]] = url
 697
 698             format_limit = self._downloader.params.get('format_limit', None)
 699             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 700             if format_limit is not None and format_limit in available_formats:
 701                 format_list = available_formats[available_formats.index(format_limit):]
 702             else:
 703                 format_list = available_formats
 704             existing_formats = [x for x in format_list if x in url_map]
 705             if len(existing_formats) == 0:
 706                 raise ExtractorError(u'no known formats available for video')
 707             if self._downloader.params.get('listformats', None):
 708                 self._print_formats(existing_formats)
 709                 return
 710             if req_format is None or req_format == 'best':
 711                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 712             elif req_format == 'worst':
 713                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 714             elif req_format in ('-1', 'all'):
 715                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 716             else:
 717                 # Specific formats. We pick the first in a slash-delimeted sequence.
 718                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 719                 req_formats = req_format.split('/')
 720                 video_url_list = None
 721                 for rf in req_formats:
 722                     if rf in url_map:
 723                         video_url_list = [(rf, url_map[rf])]
 724                         break
 725                 if video_url_list is None:
 726                     raise ExtractorError(u'requested format not available')
 727         else:
 728             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 729
 730         results = []
 731         for format_param, video_real_url in video_url_list:
 732             # Extension
 733             video_extension = self._video_extensions.get(format_param, 'flv')
 734
 735             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 736                                               self._video_dimensions.get(format_param, '???'))
 737
 738             results.append({
 739                 'id':       video_id,
 740                 'url':      video_real_url,
 741                 'uploader': video_uploader,
 742                 'uploader_id': video_uploader_id,
 743                 'upload_date':  upload_date,
 744                 'title':    video_title,
 745                 'ext':      video_extension,
 746                 'format':   video_format,
 747                 'thumbnail':    video_thumbnail,
 748                 'description':  video_description,
 749                 'player_url':   player_url,
 750                 'subtitles':    video_subtitles,
 751                 'duration':     video_duration
 752             })
 753         return results
 754
 755
 756 class MetacafeIE(InfoExtractor):
 757     """Information Extractor for metacafe.com."""
 758
 759     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 760     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 761     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 762     IE_NAME = u'metacafe'
 763
 764     def report_disclaimer(self):
 765         """Report disclaimer retrieval."""
 766         self.to_screen(u'Retrieving disclaimer')
 767
 768     def _real_initialize(self):
 769         # Retrieve disclaimer
 770         request = compat_urllib_request.Request(self._DISCLAIMER)
 771         try:
 772             self.report_disclaimer()
 773             disclaimer = compat_urllib_request.urlopen(request).read()
 774         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 775             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 776
 777         # Confirm age
 778         disclaimer_form = {
 779             'filters': '0',
 780             'submit': "Continue - I'm over 18",
 781             }
 782         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 783         try:
 784             self.report_age_confirmation()
 785             disclaimer = compat_urllib_request.urlopen(request).read()
 786         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 787             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 788
 789     def _real_extract(self, url):
 790         # Extract id and simplified title from URL
 791         mobj = re.match(self._VALID_URL, url)
 792         if mobj is None:
 793             raise ExtractorError(u'Invalid URL: %s' % url)
 794
 795         video_id = mobj.group(1)
 796
 797         # Check if video comes from YouTube
 798         mobj2 = re.match(r'^yt-(.*)$', video_id)
 799         if mobj2 is not None:
 800             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 801
 802         # Retrieve video webpage to extract further information
 803         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 804
 805         # Extract URL, uploader and title from webpage
 806         self.report_extraction(video_id)
 807         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 808         if mobj is not None:
 809             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 810             video_extension = mediaURL[-3:]
 811
 812             # Extract gdaKey if available
 813             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 814             if mobj is None:
 815                 video_url = mediaURL
 816             else:
 817                 gdaKey = mobj.group(1)
 818                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 819         else:
 820             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 821             if mobj is None:
 822                 raise ExtractorError(u'Unable to extract media URL')
 823             vardict = compat_parse_qs(mobj.group(1))
 824             if 'mediaData' not in vardict:
 825                 raise ExtractorError(u'Unable to extract media URL')
 826             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 827             if mobj is None:
 828                 raise ExtractorError(u'Unable to extract media URL')
 829             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 830             video_extension = mediaURL[-3:]
 831             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 832
 833         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 834         if mobj is None:
 835             raise ExtractorError(u'Unable to extract title')
 836         video_title = mobj.group(1).decode('utf-8')
 837
 838         mobj = re.search(r'submitter=(.*?);', webpage)
 839         if mobj is None:
 840             raise ExtractorError(u'Unable to extract uploader nickname')
 841         video_uploader = mobj.group(1)
 842
 843         return [{
 844             'id':       video_id.decode('utf-8'),
 845             'url':      video_url.decode('utf-8'),
 846             'uploader': video_uploader.decode('utf-8'),
 847             'upload_date':  None,
 848             'title':    video_title,
 849             'ext':      video_extension.decode('utf-8'),
 850         }]
 851
 852 class DailymotionIE(InfoExtractor):
 853     """Information Extractor for Dailymotion"""
 854
 855     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 856     IE_NAME = u'dailymotion'
 857
 858     def _real_extract(self, url):
 859         # Extract id and simplified title from URL
 860         mobj = re.match(self._VALID_URL, url)
 861         if mobj is None:
 862             raise ExtractorError(u'Invalid URL: %s' % url)
 863
 864         video_id = mobj.group(1).split('_')[0].split('?')[0]
 865
 866         video_extension = 'mp4'
 867
 868         # Retrieve video webpage to extract further information
 869         request = compat_urllib_request.Request(url)
 870         request.add_header('Cookie', 'family_filter=off')
 871         webpage = self._download_webpage(request, video_id)
 872
 873         # Extract URL, uploader and title from webpage
 874         self.report_extraction(video_id)
 875         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 876         if mobj is None:
 877             raise ExtractorError(u'Unable to extract media URL')
 878         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 879
 880         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 881             if key in flashvars:
 882                 max_quality = key
 883                 self.to_screen(u'Using %s' % key)
 884                 break
 885         else:
 886             raise ExtractorError(u'Unable to extract video URL')
 887
 888         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 889         if mobj is None:
 890             raise ExtractorError(u'Unable to extract video URL')
 891
 892         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 893
 894         # TODO: support choosing qualities
 895
 896         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 897         if mobj is None:
 898             raise ExtractorError(u'Unable to extract title')
 899         video_title = unescapeHTML(mobj.group('title'))
 900
 901         video_uploader = None
 902         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 903         if mobj is None:
 904             # lookin for official user
 905             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 906             if mobj_official is None:
 907                 self._downloader.report_warning(u'unable to extract uploader nickname')
 908             else:
 909                 video_uploader = mobj_official.group(1)
 910         else:
 911             video_uploader = mobj.group(1)
 912
 913         video_upload_date = None
 914         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 915         if mobj is not None:
 916             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 917
 918         return [{
 919             'id':       video_id,
 920             'url':      video_url,
 921             'uploader': video_uploader,
 922             'upload_date':  video_upload_date,
 923             'title':    video_title,
 924             'ext':      video_extension,
 925         }]
 926
 927
 928 class PhotobucketIE(InfoExtractor):
 929     """Information extractor for photobucket.com."""
 930
 931     # TODO: the original _VALID_URL was:
 932     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 933     # Check if it's necessary to keep the old extracion process
 934     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 935     IE_NAME = u'photobucket'
 936
 937     def _real_extract(self, url):
 938         # Extract id from URL
 939         mobj = re.match(self._VALID_URL, url)
 940         if mobj is None:
 941             raise ExtractorError(u'Invalid URL: %s' % url)
 942
 943         video_id = mobj.group('id')
 944
 945         video_extension = mobj.group('ext')
 946
 947         # Retrieve video webpage to extract further information
 948         webpage = self._download_webpage(url, video_id)
 949
 950         # Extract URL, uploader, and title from webpage
 951         self.report_extraction(video_id)
 952         # We try first by looking the javascript code:
 953         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 954         if mobj is not None:
 955             info = json.loads(mobj.group('json'))
 956             return [{
 957                 'id':       video_id,
 958                 'url':      info[u'downloadUrl'],
 959                 'uploader': info[u'username'],
 960                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 961                 'title':    info[u'title'],
 962                 'ext':      video_extension,
 963                 'thumbnail': info[u'thumbUrl'],
 964             }]
 965
 966         # We try looking in other parts of the webpage
 967         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 968         if mobj is None:
 969             raise ExtractorError(u'Unable to extract media URL')
 970         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 971
 972         video_url = mediaURL
 973
 974         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 975         if mobj is None:
 976             raise ExtractorError(u'Unable to extract title')
 977         video_title = mobj.group(1).decode('utf-8')
 978
 979         video_uploader = mobj.group(2).decode('utf-8')
 980
 981         return [{
 982             'id':       video_id.decode('utf-8'),
 983             'url':      video_url.decode('utf-8'),
 984             'uploader': video_uploader,
 985             'upload_date':  None,
 986             'title':    video_title,
 987             'ext':      video_extension.decode('utf-8'),
 988         }]
 989
 990
 991 class YahooIE(InfoExtractor):
 992     """Information extractor for screen.yahoo.com."""
 993     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
 994
 995     def _real_extract(self, url):
 996         mobj = re.match(self._VALID_URL, url)
 997         if mobj is None:
 998             raise ExtractorError(u'Invalid URL: %s' % url)
 999         video_id = mobj.group('id')
1000         webpage = self._download_webpage(url, video_id)
1001         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1002
1003         if m_id is None:
1004             # TODO: Check which url parameters are required
1005             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1006             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1007             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1008                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1009                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1010                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1011                         '''
1012             self.report_extraction(video_id)
1013             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1014             if m_info is None:
1015                 raise ExtractorError(u'Unable to extract video info')
1016             video_title = m_info.group('title')
1017             video_description = m_info.group('description')
1018             video_thumb = m_info.group('thumb')
1019             video_date = m_info.group('date')
1020             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1021
1022             # TODO: Find a way to get mp4 videos
1023             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1024             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1025             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1026             video_url = m_rest.group('url')
1027             video_path = m_rest.group('path')
1028             if m_rest is None:
1029                 raise ExtractorError(u'Unable to extract video url')
1030
1031         else: # We have to use a different method if another id is defined
1032             long_id = m_id.group('new_id')
1033             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1034             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1035             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1036             info = json.loads(json_str)
1037             res = info[u'query'][u'results'][u'mediaObj'][0]
1038             stream = res[u'streams'][0]
1039             video_path = stream[u'path']
1040             video_url = stream[u'host']
1041             meta = res[u'meta']
1042             video_title = meta[u'title']
1043             video_description = meta[u'description']
1044             video_thumb = meta[u'thumbnail']
1045             video_date = None # I can't find it
1046
1047         info_dict = {
1048                      'id': video_id,
1049                      'url': video_url,
1050                      'play_path': video_path,
1051                      'title':video_title,
1052                      'description': video_description,
1053                      'thumbnail': video_thumb,
1054                      'upload_date': video_date,
1055                      'ext': 'flv',
1056                      }
1057         return info_dict
1058
1059 class VimeoIE(InfoExtractor):
1060     """Information extractor for vimeo.com."""
1061
1062     # _VALID_URL matches Vimeo URLs
1063     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1064     IE_NAME = u'vimeo'
1065
1066     def _verify_video_password(self, url, video_id, webpage):
1067         password = self._downloader.params.get('password', None)
1068         if password is None:
1069             raise ExtractorError(u'This video is protected by a password, use the --password option')
1070         token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
1071         data = compat_urllib_parse.urlencode({'password': password,
1072                                               'token': token})
1073         # I didn't manage to use the password with https
1074         if url.startswith('https'):
1075             pass_url = url.replace('https','http')
1076         else:
1077             pass_url = url
1078         password_request = compat_urllib_request.Request(pass_url+'/password', data)
1079         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1080         password_request.add_header('Cookie', 'xsrft=%s' % token)
1081         pass_web = self._download_webpage(password_request, video_id,
1082                                           u'Verifying the password',
1083                                           u'Wrong password')
1084
1085     def _real_extract(self, url, new_video=True):
1086         # Extract ID from URL
1087         mobj = re.match(self._VALID_URL, url)
1088         if mobj is None:
1089             raise ExtractorError(u'Invalid URL: %s' % url)
1090
1091         video_id = mobj.group('id')
1092         if not mobj.group('proto'):
1093             url = 'https://' + url
1094         if mobj.group('direct_link') or mobj.group('pro'):
1095             url = 'https://vimeo.com/' + video_id
1096
1097         # Retrieve video webpage to extract further information
1098         request = compat_urllib_request.Request(url, None, std_headers)
1099         webpage = self._download_webpage(request, video_id)
1100
1101         # Now we begin extracting as much information as we can from what we
1102         # retrieved. First we extract the information common to all extractors,
1103         # and latter we extract those that are Vimeo specific.
1104         self.report_extraction(video_id)
1105
1106         # Extract the config JSON
1107         try:
1108             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1109             config = json.loads(config)
1110         except:
1111             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1112                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1113
1114             if re.search('If so please provide the correct password.', webpage):
1115                 self._verify_video_password(url, video_id, webpage)
1116                 return self._real_extract(url)
1117             else:
1118                 raise ExtractorError(u'Unable to extract info section')
1119
1120         # Extract title
1121         video_title = config["video"]["title"]
1122
1123         # Extract uploader and uploader_id
1124         video_uploader = config["video"]["owner"]["name"]
1125         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1126
1127         # Extract video thumbnail
1128         video_thumbnail = config["video"]["thumbnail"]
1129
1130         # Extract video description
1131         video_description = get_element_by_attribute("itemprop", "description", webpage)
1132         if video_description: video_description = clean_html(video_description)
1133         else: video_description = u''
1134
1135         # Extract upload date
1136         video_upload_date = None
1137         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1138         if mobj is not None:
1139             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1140
1141         # Vimeo specific: extract request signature and timestamp
1142         sig = config['request']['signature']
1143         timestamp = config['request']['timestamp']
1144
1145         # Vimeo specific: extract video codec and quality information
1146         # First consider quality, then codecs, then take everything
1147         # TODO bind to format param
1148         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1149         files = { 'hd': [], 'sd': [], 'other': []}
1150         for codec_name, codec_extension in codecs:
1151             if codec_name in config["video"]["files"]:
1152                 if 'hd' in config["video"]["files"][codec_name]:
1153                     files['hd'].append((codec_name, codec_extension, 'hd'))
1154                 elif 'sd' in config["video"]["files"][codec_name]:
1155                     files['sd'].append((codec_name, codec_extension, 'sd'))
1156                 else:
1157                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1158
1159         for quality in ('hd', 'sd', 'other'):
1160             if len(files[quality]) > 0:
1161                 video_quality = files[quality][0][2]
1162                 video_codec = files[quality][0][0]
1163                 video_extension = files[quality][0][1]
1164                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1165                 break
1166         else:
1167             raise ExtractorError(u'No known codec found')
1168
1169         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1170                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1171
1172         return [{
1173             'id':       video_id,
1174             'url':      video_url,
1175             'uploader': video_uploader,
1176             'uploader_id': video_uploader_id,
1177             'upload_date':  video_upload_date,
1178             'title':    video_title,
1179             'ext':      video_extension,
1180             'thumbnail':    video_thumbnail,
1181             'description':  video_description,
1182         }]
1183
1184
1185 class ArteTvIE(InfoExtractor):
1186     """arte.tv information extractor."""
1187
1188     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1189     _LIVE_URL = r'index-[0-9]+\.html$'
1190
1191     IE_NAME = u'arte.tv'
1192
1193     def fetch_webpage(self, url):
1194         request = compat_urllib_request.Request(url)
1195         try:
1196             self.report_download_webpage(url)
1197             webpage = compat_urllib_request.urlopen(request).read()
1198         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1199             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1200         except ValueError as err:
1201             raise ExtractorError(u'Invalid URL: %s' % url)
1202         return webpage
1203
1204     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1205         page = self.fetch_webpage(url)
1206         mobj = re.search(regex, page, regexFlags)
1207         info = {}
1208
1209         if mobj is None:
1210             raise ExtractorError(u'Invalid URL: %s' % url)
1211
1212         for (i, key, err) in matchTuples:
1213             if mobj.group(i) is None:
1214                 raise ExtractorError(err)
1215             else:
1216                 info[key] = mobj.group(i)
1217
1218         return info
1219
1220     def extractLiveStream(self, url):
1221         video_lang = url.split('/')[-4]
1222         info = self.grep_webpage(
1223             url,
1224             r'src="(.*?/videothek_js.*?\.js)',
1225             0,
1226             [
1227                 (1, 'url', u'Invalid URL: %s' % url)
1228             ]
1229         )
1230         http_host = url.split('/')[2]
1231         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1232         info = self.grep_webpage(
1233             next_url,
1234             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1235                 '(http://.*?\.swf).*?' +
1236                 '(rtmp://.*?)\'',
1237             re.DOTALL,
1238             [
1239                 (1, 'path',   u'could not extract video path: %s' % url),
1240                 (2, 'player', u'could not extract video player: %s' % url),
1241                 (3, 'url',    u'could not extract video url: %s' % url)
1242             ]
1243         )
1244         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1245
1246     def extractPlus7Stream(self, url):
1247         video_lang = url.split('/')[-3]
1248         info = self.grep_webpage(
1249             url,
1250             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1251             0,
1252             [
1253                 (1, 'url', u'Invalid URL: %s' % url)
1254             ]
1255         )
1256         next_url = compat_urllib_parse.unquote(info.get('url'))
1257         info = self.grep_webpage(
1258             next_url,
1259             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1260             0,
1261             [
1262                 (1, 'url', u'Could not find <video> tag: %s' % url)
1263             ]
1264         )
1265         next_url = compat_urllib_parse.unquote(info.get('url'))
1266
1267         info = self.grep_webpage(
1268             next_url,
1269             r'<video id="(.*?)".*?>.*?' +
1270                 '<name>(.*?)</name>.*?' +
1271                 '<dateVideo>(.*?)</dateVideo>.*?' +
1272                 '<url quality="hd">(.*?)</url>',
1273             re.DOTALL,
1274             [
1275                 (1, 'id',    u'could not extract video id: %s' % url),
1276                 (2, 'title', u'could not extract video title: %s' % url),
1277                 (3, 'date',  u'could not extract video date: %s' % url),
1278                 (4, 'url',   u'could not extract video url: %s' % url)
1279             ]
1280         )
1281
1282         return {
1283             'id':           info.get('id'),
1284             'url':          compat_urllib_parse.unquote(info.get('url')),
1285             'uploader':     u'arte.tv',
1286             'upload_date':  unified_strdate(info.get('date')),
1287             'title':        info.get('title').decode('utf-8'),
1288             'ext':          u'mp4',
1289             'format':       u'NA',
1290             'player_url':   None,
1291         }
1292
1293     def _real_extract(self, url):
1294         video_id = url.split('/')[-1]
1295         self.report_extraction(video_id)
1296
1297         if re.search(self._LIVE_URL, video_id) is not None:
1298             self.extractLiveStream(url)
1299             return
1300         else:
1301             info = self.extractPlus7Stream(url)
1302
1303         return [info]
1304
1305
1306 class GenericIE(InfoExtractor):
1307     """Generic last-resort information extractor."""
1308
1309     _VALID_URL = r'.*'
1310     IE_NAME = u'generic'
1311
1312     def report_download_webpage(self, video_id):
1313         """Report webpage download."""
1314         if not self._downloader.params.get('test', False):
1315             self._downloader.report_warning(u'Falling back on generic information extractor.')
1316         super(GenericIE, self).report_download_webpage(video_id)
1317
1318     def report_following_redirect(self, new_url):
1319         """Report information extraction."""
1320         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1321
1322     def _test_redirect(self, url):
1323         """Check if it is a redirect, like url shorteners, in case return the new url."""
1324         class HeadRequest(compat_urllib_request.Request):
1325             def get_method(self):
1326                 return "HEAD"
1327
1328         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1329             """
1330             Subclass the HTTPRedirectHandler to make it use our
1331             HeadRequest also on the redirected URL
1332             """
1333             def redirect_request(self, req, fp, code, msg, headers, newurl):
1334                 if code in (301, 302, 303, 307):
1335                     newurl = newurl.replace(' ', '%20')
1336                     newheaders = dict((k,v) for k,v in req.headers.items()
1337                                       if k.lower() not in ("content-length", "content-type"))
1338                     return HeadRequest(newurl,
1339                                        headers=newheaders,
1340                                        origin_req_host=req.get_origin_req_host(),
1341                                        unverifiable=True)
1342                 else:
1343                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1344
1345         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1346             """
1347             Fallback to GET if HEAD is not allowed (405 HTTP error)
1348             """
1349             def http_error_405(self, req, fp, code, msg, headers):
1350                 fp.read()
1351                 fp.close()
1352
1353                 newheaders = dict((k,v) for k,v in req.headers.items()
1354                                   if k.lower() not in ("content-length", "content-type"))
1355                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1356                                                  headers=newheaders,
1357                                                  origin_req_host=req.get_origin_req_host(),
1358                                                  unverifiable=True))
1359
1360         # Build our opener
1361         opener = compat_urllib_request.OpenerDirector()
1362         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1363                         HTTPMethodFallback, HEADRedirectHandler,
1364                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1365             opener.add_handler(handler())
1366
1367         response = opener.open(HeadRequest(url))
1368         if response is None:
1369             raise ExtractorError(u'Invalid URL protocol')
1370         new_url = response.geturl()
1371
1372         if url == new_url:
1373             return False
1374
1375         self.report_following_redirect(new_url)
1376         return new_url
1377
1378     def _real_extract(self, url):
1379         new_url = self._test_redirect(url)
1380         if new_url: return [self.url_result(new_url)]
1381
1382         video_id = url.split('/')[-1]
1383         try:
1384             webpage = self._download_webpage(url, video_id)
1385         except ValueError as err:
1386             # since this is the last-resort InfoExtractor, if
1387             # this error is thrown, it'll be thrown here
1388             raise ExtractorError(u'Invalid URL: %s' % url)
1389
1390         self.report_extraction(video_id)
1391         # Start with something easy: JW Player in SWFObject
1392         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1393         if mobj is None:
1394             # Broaden the search a little bit
1395             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1396         if mobj is None:
1397             # Broaden the search a little bit: JWPlayer JS loader
1398             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1399         if mobj is None:
1400             # Try to find twitter cards info
1401             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1402         if mobj is None:
1403             raise ExtractorError(u'Invalid URL: %s' % url)
1404
1405         # It's possible that one of the regexes
1406         # matched, but returned an empty group:
1407         if mobj.group(1) is None:
1408             raise ExtractorError(u'Invalid URL: %s' % url)
1409
1410         video_url = compat_urllib_parse.unquote(mobj.group(1))
1411         video_id = os.path.basename(video_url)
1412
1413         # here's a fun little line of code for you:
1414         video_extension = os.path.splitext(video_id)[1][1:]
1415         video_id = os.path.splitext(video_id)[0]
1416
1417         # it's tempting to parse this further, but you would
1418         # have to take into account all the variations like
1419         #   Video Title - Site Name
1420         #   Site Name | Video Title
1421         #   Video Title - Tagline | Site Name
1422         # and so on and so forth; it's just not practical
1423         mobj = re.search(r'<title>(.*)</title>', webpage)
1424         if mobj is None:
1425             raise ExtractorError(u'Unable to extract title')
1426         video_title = mobj.group(1)
1427
1428         # video uploader is domain name
1429         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1430         if mobj is None:
1431             raise ExtractorError(u'Unable to extract title')
1432         video_uploader = mobj.group(1)
1433
1434         return [{
1435             'id':       video_id,
1436             'url':      video_url,
1437             'uploader': video_uploader,
1438             'upload_date':  None,
1439             'title':    video_title,
1440             'ext':      video_extension,
1441         }]
1442
1443
1444 class YoutubeSearchIE(SearchInfoExtractor):
1445     """Information Extractor for YouTube search queries."""
1446     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1447     _MAX_RESULTS = 1000
1448     IE_NAME = u'youtube:search'
1449     _SEARCH_KEY = 'ytsearch'
1450
1451     def report_download_page(self, query, pagenum):
1452         """Report attempt to download search page with given number."""
1453         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1454
1455     def _get_n_results(self, query, n):
1456         """Get a specified number of results for a query"""
1457
1458         video_ids = []
1459         pagenum = 0
1460         limit = n
1461
1462         while (50 * pagenum) < limit:
1463             self.report_download_page(query, pagenum+1)
1464             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1465             request = compat_urllib_request.Request(result_url)
1466             try:
1467                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1468             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1469                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1470             api_response = json.loads(data)['data']
1471
1472             if not 'items' in api_response:
1473                 raise ExtractorError(u'[youtube] No video results')
1474
1475             new_ids = list(video['id'] for video in api_response['items'])
1476             video_ids += new_ids
1477
1478             limit = min(n, api_response['totalItems'])
1479             pagenum += 1
1480
1481         if len(video_ids) > n:
1482             video_ids = video_ids[:n]
1483         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1484         return self.playlist_result(videos, query)
1485
1486
1487 class GoogleSearchIE(SearchInfoExtractor):
1488     """Information Extractor for Google Video search queries."""
1489     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1490     _MAX_RESULTS = 1000
1491     IE_NAME = u'video.google:search'
1492     _SEARCH_KEY = 'gvsearch'
1493
1494     def _get_n_results(self, query, n):
1495         """Get a specified number of results for a query"""
1496
1497         res = {
1498             '_type': 'playlist',
1499             'id': query,
1500             'entries': []
1501         }
1502
1503         for pagenum in itertools.count(1):
1504             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1505             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1506                                              note='Downloading result page ' + str(pagenum))
1507
1508             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1509                 e = {
1510                     '_type': 'url',
1511                     'url': mobj.group(1)
1512                 }
1513                 res['entries'].append(e)
1514
1515             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1516                 return res
1517
1518 class YahooSearchIE(SearchInfoExtractor):
1519     """Information Extractor for Yahoo! Video search queries."""
1520
1521     _MAX_RESULTS = 1000
1522     IE_NAME = u'screen.yahoo:search'
1523     _SEARCH_KEY = 'yvsearch'
1524
1525     def _get_n_results(self, query, n):
1526         """Get a specified number of results for a query"""
1527
1528         res = {
1529             '_type': 'playlist',
1530             'id': query,
1531             'entries': []
1532         }
1533         for pagenum in itertools.count(0):
1534             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1535             webpage = self._download_webpage(result_url, query,
1536                                              note='Downloading results page '+str(pagenum+1))
1537             info = json.loads(webpage)
1538             m = info[u'm']
1539             results = info[u'results']
1540
1541             for (i, r) in enumerate(results):
1542                 if (pagenum * 30) +i >= n:
1543                     break
1544                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1545                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1546                 res['entries'].append(e)
1547             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1548                 break
1549
1550         return res
1551
1552
1553 class YoutubePlaylistIE(InfoExtractor):
1554     """Information Extractor for YouTube playlists."""
1555
1556     _VALID_URL = r"""(?:
1557                         (?:https?://)?
1558                         (?:\w+\.)?
1559                         youtube\.com/
1560                         (?:
1561                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1562                            \? (?:.*?&)*? (?:p|a|list)=
1563                         |  p/
1564                         )
1565                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1566                         .*
1567                      |
1568                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1569                      )"""
1570     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1571     _MAX_RESULTS = 50
1572     IE_NAME = u'youtube:playlist'
1573
1574     @classmethod
1575     def suitable(cls, url):
1576         """Receives a URL and returns True if suitable for this IE."""
1577         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1578
1579     def _real_extract(self, url):
1580         # Extract playlist id
1581         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1582         if mobj is None:
1583             raise ExtractorError(u'Invalid URL: %s' % url)
1584
1585         # Download playlist videos from API
1586         playlist_id = mobj.group(1) or mobj.group(2)
1587         page_num = 1
1588         videos = []
1589
1590         while True:
1591             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1592             page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1593
1594             try:
1595                 response = json.loads(page)
1596             except ValueError as err:
1597                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1598
1599             if 'feed' not in response:
1600                 raise ExtractorError(u'Got a malformed response from YouTube API')
1601             playlist_title = response['feed']['title']['$t']
1602             if 'entry' not in response['feed']:
1603                 # Number of videos is a multiple of self._MAX_RESULTS
1604                 break
1605
1606             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1607                         for entry in response['feed']['entry']
1608                         if 'content' in entry ]
1609
1610             if len(response['feed']['entry']) < self._MAX_RESULTS:
1611                 break
1612             page_num += 1
1613
1614         videos = [v[1] for v in sorted(videos)]
1615
1616         url_results = [self.url_result(url, 'Youtube') for url in videos]
1617         return [self.playlist_result(url_results, playlist_id, playlist_title)]
1618
1619
1620 class YoutubeChannelIE(InfoExtractor):
1621     """Information Extractor for YouTube channels."""
1622
1623     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1624     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1625     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1626     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1627     IE_NAME = u'youtube:channel'
1628
1629     def extract_videos_from_page(self, page):
1630         ids_in_page = []
1631         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1632             if mobj.group(1) not in ids_in_page:
1633                 ids_in_page.append(mobj.group(1))
1634         return ids_in_page
1635
1636     def _real_extract(self, url):
1637         # Extract channel id
1638         mobj = re.match(self._VALID_URL, url)
1639         if mobj is None:
1640             raise ExtractorError(u'Invalid URL: %s' % url)
1641
1642         # Download channel page
1643         channel_id = mobj.group(1)
1644         video_ids = []
1645         pagenum = 1
1646
1647         url = self._TEMPLATE_URL % (channel_id, pagenum)
1648         page = self._download_webpage(url, channel_id,
1649                                       u'Downloading page #%s' % pagenum)
1650
1651         # Extract video identifiers
1652         ids_in_page = self.extract_videos_from_page(page)
1653         video_ids.extend(ids_in_page)
1654
1655         # Download any subsequent channel pages using the json-based channel_ajax query
1656         if self._MORE_PAGES_INDICATOR in page:
1657             while True:
1658                 pagenum = pagenum + 1
1659
1660                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1661                 page = self._download_webpage(url, channel_id,
1662                                               u'Downloading page #%s' % pagenum)
1663
1664                 page = json.loads(page)
1665
1666                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1667                 video_ids.extend(ids_in_page)
1668
1669                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1670                     break
1671
1672         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1673
1674         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1675         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1676         return [self.playlist_result(url_entries, channel_id)]
1677
1678
1679 class YoutubeUserIE(InfoExtractor):
1680     """Information Extractor for YouTube users."""
1681
1682     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1683     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1684     _GDATA_PAGE_SIZE = 50
1685     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1686     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1687     IE_NAME = u'youtube:user'
1688
1689     def _real_extract(self, url):
1690         # Extract username
1691         mobj = re.match(self._VALID_URL, url)
1692         if mobj is None:
1693             raise ExtractorError(u'Invalid URL: %s' % url)
1694
1695         username = mobj.group(1)
1696
1697         # Download video ids using YouTube Data API. Result size per
1698         # query is limited (currently to 50 videos) so we need to query
1699         # page by page until there are no video ids - it means we got
1700         # all of them.
1701
1702         video_ids = []
1703         pagenum = 0
1704
1705         while True:
1706             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1707
1708             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1709             page = self._download_webpage(gdata_url, username,
1710                                           u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1711
1712             # Extract video identifiers
1713             ids_in_page = []
1714
1715             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1716                 if mobj.group(1) not in ids_in_page:
1717                     ids_in_page.append(mobj.group(1))
1718
1719             video_ids.extend(ids_in_page)
1720
1721             # A little optimization - if current page is not
1722             # "full", ie. does not contain PAGE_SIZE video ids then
1723             # we can assume that this page is the last one - there
1724             # are no more ids on further pages - no need to query
1725             # again.
1726
1727             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1728                 break
1729
1730             pagenum += 1
1731
1732         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1733         url_results = [self.url_result(url, 'Youtube') for url in urls]
1734         return [self.playlist_result(url_results, playlist_title = username)]
1735
1736
1737 class BlipTVUserIE(InfoExtractor):
1738     """Information Extractor for blip.tv users."""
1739
1740     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1741     _PAGE_SIZE = 12
1742     IE_NAME = u'blip.tv:user'
1743
1744     def _real_extract(self, url):
1745         # Extract username
1746         mobj = re.match(self._VALID_URL, url)
1747         if mobj is None:
1748             raise ExtractorError(u'Invalid URL: %s' % url)
1749
1750         username = mobj.group(1)
1751
1752         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1753
1754         page = self._download_webpage(url, username, u'Downloading user page')
1755         mobj = re.search(r'data-users-id="([^"]+)"', page)
1756         page_base = page_base % mobj.group(1)
1757
1758
1759         # Download video ids using BlipTV Ajax calls. Result size per
1760         # query is limited (currently to 12 videos) so we need to query
1761         # page by page until there are no video ids - it means we got
1762         # all of them.
1763
1764         video_ids = []
1765         pagenum = 1
1766
1767         while True:
1768             url = page_base + "&page=" + str(pagenum)
1769             page = self._download_webpage(url, username,
1770                                           u'Downloading video ids from page %d' % pagenum)
1771
1772             # Extract video identifiers
1773             ids_in_page = []
1774
1775             for mobj in re.finditer(r'href="/([^"]+)"', page):
1776                 if mobj.group(1) not in ids_in_page:
1777                     ids_in_page.append(unescapeHTML(mobj.group(1)))
1778
1779             video_ids.extend(ids_in_page)
1780
1781             # A little optimization - if current page is not
1782             # "full", ie. does not contain PAGE_SIZE video ids then
1783             # we can assume that this page is the last one - there
1784             # are no more ids on further pages - no need to query
1785             # again.
1786
1787             if len(ids_in_page) < self._PAGE_SIZE:
1788                 break
1789
1790             pagenum += 1
1791
1792         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1793         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1794         return [self.playlist_result(url_entries, playlist_title = username)]
1795
1796
1797 class DepositFilesIE(InfoExtractor):
1798     """Information extractor for depositfiles.com"""
1799
1800     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1801
1802     def _real_extract(self, url):
1803         file_id = url.split('/')[-1]
1804         # Rebuild url in english locale
1805         url = 'http://depositfiles.com/en/files/' + file_id
1806
1807         # Retrieve file webpage with 'Free download' button pressed
1808         free_download_indication = { 'gateway_result' : '1' }
1809         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1810         try:
1811             self.report_download_webpage(file_id)
1812             webpage = compat_urllib_request.urlopen(request).read()
1813         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1814             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1815
1816         # Search for the real file URL
1817         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1818         if (mobj is None) or (mobj.group(1) is None):
1819             # Try to figure out reason of the error.
1820             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1821             if (mobj is not None) and (mobj.group(1) is not None):
1822                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1823                 raise ExtractorError(u'%s' % restriction_message)
1824             else:
1825                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1826
1827         file_url = mobj.group(1)
1828         file_extension = os.path.splitext(file_url)[1][1:]
1829
1830         # Search for file title
1831         mobj = re.search(r'<b title="(.*?)">', webpage)
1832         if mobj is None:
1833             raise ExtractorError(u'Unable to extract title')
1834         file_title = mobj.group(1).decode('utf-8')
1835
1836         return [{
1837             'id':       file_id.decode('utf-8'),
1838             'url':      file_url.decode('utf-8'),
1839             'uploader': None,
1840             'upload_date':  None,
1841             'title':    file_title,
1842             'ext':      file_extension.decode('utf-8'),
1843         }]
1844
1845
1846 class FacebookIE(InfoExtractor):
1847     """Information Extractor for Facebook"""
1848
1849     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1850     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1851     _NETRC_MACHINE = 'facebook'
1852     IE_NAME = u'facebook'
1853
1854     def report_login(self):
1855         """Report attempt to log in."""
1856         self.to_screen(u'Logging in')
1857
1858     def _real_initialize(self):
1859         if self._downloader is None:
1860             return
1861
1862         useremail = None
1863         password = None
1864         downloader_params = self._downloader.params
1865
1866         # Attempt to use provided username and password or .netrc data
1867         if downloader_params.get('username', None) is not None:
1868             useremail = downloader_params['username']
1869             password = downloader_params['password']
1870         elif downloader_params.get('usenetrc', False):
1871             try:
1872                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1873                 if info is not None:
1874                     useremail = info[0]
1875                     password = info[2]
1876                 else:
1877                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1878             except (IOError, netrc.NetrcParseError) as err:
1879                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1880                 return
1881
1882         if useremail is None:
1883             return
1884
1885         # Log in
1886         login_form = {
1887             'email': useremail,
1888             'pass': password,
1889             'login': 'Log+In'
1890             }
1891         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1892         try:
1893             self.report_login()
1894             login_results = compat_urllib_request.urlopen(request).read()
1895             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1896                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1897                 return
1898         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1899             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1900             return
1901
1902     def _real_extract(self, url):
1903         mobj = re.match(self._VALID_URL, url)
1904         if mobj is None:
1905             raise ExtractorError(u'Invalid URL: %s' % url)
1906         video_id = mobj.group('ID')
1907
1908         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1909         webpage = self._download_webpage(url, video_id)
1910
1911         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1912         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1913         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1914         if not m:
1915             raise ExtractorError(u'Cannot parse data')
1916         data = dict(json.loads(m.group(1)))
1917         params_raw = compat_urllib_parse.unquote(data['params'])
1918         params = json.loads(params_raw)
1919         video_data = params['video_data'][0]
1920         video_url = video_data.get('hd_src')
1921         if not video_url:
1922             video_url = video_data['sd_src']
1923         if not video_url:
1924             raise ExtractorError(u'Cannot find video URL')
1925         video_duration = int(video_data['video_duration'])
1926         thumbnail = video_data['thumbnail_src']
1927
1928         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
1929         if not m:
1930             raise ExtractorError(u'Cannot find title in webpage')
1931         video_title = unescapeHTML(m.group(1))
1932
1933         info = {
1934             'id': video_id,
1935             'title': video_title,
1936             'url': video_url,
1937             'ext': 'mp4',
1938             'duration': video_duration,
1939             'thumbnail': thumbnail,
1940         }
1941         return [info]
1942
1943
1944 class BlipTVIE(InfoExtractor):
1945     """Information extractor for blip.tv"""
1946
1947     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1948     _URL_EXT = r'^.*\.([a-z0-9]+)$'
1949     IE_NAME = u'blip.tv'
1950
1951     def report_direct_download(self, title):
1952         """Report information extraction."""
1953         self.to_screen(u'%s: Direct download detected' % title)
1954
1955     def _real_extract(self, url):
1956         mobj = re.match(self._VALID_URL, url)
1957         if mobj is None:
1958             raise ExtractorError(u'Invalid URL: %s' % url)
1959
1960         # See https://github.com/rg3/youtube-dl/issues/857
1961         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1962         if api_mobj is not None:
1963             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1964         urlp = compat_urllib_parse_urlparse(url)
1965         if urlp.path.startswith('/play/'):
1966             request = compat_urllib_request.Request(url)
1967             response = compat_urllib_request.urlopen(request)
1968             redirecturl = response.geturl()
1969             rurlp = compat_urllib_parse_urlparse(redirecturl)
1970             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1971             url = 'http://blip.tv/a/a-' + file_id
1972             return self._real_extract(url)
1973
1974
1975         if '?' in url:
1976             cchar = '&'
1977         else:
1978             cchar = '?'
1979         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1980         request = compat_urllib_request.Request(json_url)
1981         request.add_header('User-Agent', 'iTunes/10.6.1')
1982         self.report_extraction(mobj.group(1))
1983         info = None
1984         try:
1985             urlh = compat_urllib_request.urlopen(request)
1986             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1987                 basename = url.split('/')[-1]
1988                 title,ext = os.path.splitext(basename)
1989                 title = title.decode('UTF-8')
1990                 ext = ext.replace('.', '')
1991                 self.report_direct_download(title)
1992                 info = {
1993                     'id': title,
1994                     'url': url,
1995                     'uploader': None,
1996                     'upload_date': None,
1997                     'title': title,
1998                     'ext': ext,
1999                     'urlhandle': urlh
2000                 }
2001         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2002             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2003         if info is None: # Regular URL
2004             try:
2005                 json_code_bytes = urlh.read()
2006                 json_code = json_code_bytes.decode('utf-8')
2007             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2009
2010             try:
2011                 json_data = json.loads(json_code)
2012                 if 'Post' in json_data:
2013                     data = json_data['Post']
2014                 else:
2015                     data = json_data
2016
2017                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2018                 video_url = data['media']['url']
2019                 umobj = re.match(self._URL_EXT, video_url)
2020                 if umobj is None:
2021                     raise ValueError('Can not determine filename extension')
2022                 ext = umobj.group(1)
2023
2024                 info = {
2025                     'id': data['item_id'],
2026                     'url': video_url,
2027                     'uploader': data['display_name'],
2028                     'upload_date': upload_date,
2029                     'title': data['title'],
2030                     'ext': ext,
2031                     'format': data['media']['mimeType'],
2032                     'thumbnail': data['thumbnailUrl'],
2033                     'description': data['description'],
2034                     'player_url': data['embedUrl'],
2035                     'user_agent': 'iTunes/10.6.1',
2036                 }
2037             except (ValueError,KeyError) as err:
2038                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
2039
2040         return [info]
2041
2042
2043 class MyVideoIE(InfoExtractor):
2044     """Information Extractor for myvideo.de."""
2045
2046     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2047     IE_NAME = u'myvideo'
2048
2049     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2050     # Released into the Public Domain by Tristan Fischer on 2013-05-19
2051     # https://github.com/rg3/youtube-dl/pull/842
2052     def __rc4crypt(self,data, key):
2053         x = 0
2054         box = list(range(256))
2055         for i in list(range(256)):
2056             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2057             box[i], box[x] = box[x], box[i]
2058         x = 0
2059         y = 0
2060         out = ''
2061         for char in data:
2062             x = (x + 1) % 256
2063             y = (y + box[x]) % 256
2064             box[x], box[y] = box[y], box[x]
2065             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2066         return out
2067
2068     def __md5(self,s):
2069         return hashlib.md5(s).hexdigest().encode()
2070
2071     def _real_extract(self,url):
2072         mobj = re.match(self._VALID_URL, url)
2073         if mobj is None:
2074             raise ExtractorError(u'invalid URL: %s' % url)
2075
2076         video_id = mobj.group(1)
2077
2078         GK = (
2079           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2080           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2081           b'TnpsbA0KTVRkbU1tSTRNdz09'
2082         )
2083
2084         # Get video webpage
2085         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2086         webpage = self._download_webpage(webpage_url, video_id)
2087
2088         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2089         if mobj is not None:
2090             self.report_extraction(video_id)
2091             video_url = mobj.group(1) + '.flv'
2092
2093             mobj = re.search('<title>([^<]+)</title>', webpage)
2094             if mobj is None:
2095                 raise ExtractorError(u'Unable to extract title')
2096             video_title = mobj.group(1)
2097
2098             mobj = re.search('[.](.+?)$', video_url)
2099             if mobj is None:
2100                 raise ExtractorError(u'Unable to extract extention')
2101             video_ext = mobj.group(1)
2102
2103             return [{
2104                 'id':       video_id,
2105                 'url':      video_url,
2106                 'uploader': None,
2107                 'upload_date':  None,
2108                 'title':    video_title,
2109                 'ext':      u'flv',
2110             }]
2111
2112         # try encxml
2113         mobj = re.search('var flashvars={(.+?)}', webpage)
2114         if mobj is None:
2115             raise ExtractorError(u'Unable to extract video')
2116
2117         params = {}
2118         encxml = ''
2119         sec = mobj.group(1)
2120         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2121             if not a == '_encxml':
2122                 params[a] = b
2123             else:
2124                 encxml = compat_urllib_parse.unquote(b)
2125         if not params.get('domain'):
2126             params['domain'] = 'www.myvideo.de'
2127         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2128         if 'flash_playertype=MTV' in xmldata_url:
2129             self._downloader.report_warning(u'avoiding MTV player')
2130             xmldata_url = (
2131                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2132                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2133             ) % video_id
2134
2135         # get enc data
2136         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2137         enc_data_b = binascii.unhexlify(enc_data)
2138         sk = self.__md5(
2139             base64.b64decode(base64.b64decode(GK)) +
2140             self.__md5(
2141                 str(video_id).encode('utf-8')
2142             )
2143         )
2144         dec_data = self.__rc4crypt(enc_data_b, sk)
2145
2146         # extracting infos
2147         self.report_extraction(video_id)
2148
2149         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2150         if mobj is None:
2151             raise ExtractorError(u'unable to extract rtmpurl')
2152         video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2153         if 'myvideo2flash' in video_rtmpurl:
2154             self._downloader.report_warning(u'forcing RTMPT ...')
2155             video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2156
2157         # extract non rtmp videos
2158         if (video_rtmpurl is None) or (video_rtmpurl == ''):
2159             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2160             if mobj is None:
2161                 raise ExtractorError(u'unable to extract url')
2162             video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2163
2164         mobj = re.search('source=\'(.*?)\'', dec_data)
2165         if mobj is None:
2166             raise ExtractorError(u'unable to extract swfobj')
2167         video_file     = compat_urllib_parse.unquote(mobj.group(1))
2168
2169         if not video_file.endswith('f4m'):
2170             ppath, prefix = video_file.split('.')
2171             video_playpath = '%s:%s' % (prefix, ppath)
2172             video_hls_playlist = ''
2173         else:
2174             video_playpath = ''
2175             video_hls_playlist = (
2176                 video_filepath + video_file
2177             ).replace('.f4m', '.m3u8')
2178
2179         mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2180         if mobj is None:
2181             raise ExtractorError(u'unable to extract swfobj')
2182         video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2183
2184         mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2185         if mobj is None:
2186             raise ExtractorError(u'unable to extract title')
2187         video_title = mobj.group(1)
2188
2189         return [{
2190             'id':                 video_id,
2191             'url':                video_rtmpurl,
2192             'tc_url':             video_rtmpurl,
2193             'uploader':           None,
2194             'upload_date':        None,
2195             'title':              video_title,
2196             'ext':                u'flv',
2197             'play_path':          video_playpath,
2198             'video_file':         video_file,
2199             'video_hls_playlist': video_hls_playlist,
2200             'player_url':         video_swfobj,
2201         }]
2202
2203 class ComedyCentralIE(InfoExtractor):
2204     """Information extractor for The Daily Show and Colbert Report """
2205
2206     # urls can be abbreviations like :thedailyshow or :colbert
2207     # urls for episodes like:
2208     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2209     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2210     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2211     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2212                       |(https?://)?(www\.)?
2213                           (?P<showname>thedailyshow|colbertnation)\.com/
2214                          (full-episodes/(?P<episode>.*)|
2215                           (?P<clip>
2216                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2217                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2218                      $"""
2219
2220     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2221
2222     _video_extensions = {
2223         '3500': 'mp4',
2224         '2200': 'mp4',
2225         '1700': 'mp4',
2226         '1200': 'mp4',
2227         '750': 'mp4',
2228         '400': 'mp4',
2229     }
2230     _video_dimensions = {
2231         '3500': '1280x720',
2232         '2200': '960x540',
2233         '1700': '768x432',
2234         '1200': '640x360',
2235         '750': '512x288',
2236         '400': '384x216',
2237     }
2238
2239     @classmethod
2240     def suitable(cls, url):
2241         """Receives a URL and returns True if suitable for this IE."""
2242         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2243
2244     def _print_formats(self, formats):
2245         print('Available formats:')
2246         for x in formats:
2247             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2248
2249
2250     def _real_extract(self, url):
2251         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2252         if mobj is None:
2253             raise ExtractorError(u'Invalid URL: %s' % url)
2254
2255         if mobj.group('shortname'):
2256             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2257                 url = u'http://www.thedailyshow.com/full-episodes/'
2258             else:
2259                 url = u'http://www.colbertnation.com/full-episodes/'
2260             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2261             assert mobj is not None
2262
2263         if mobj.group('clip'):
2264             if mobj.group('showname') == 'thedailyshow':
2265                 epTitle = mobj.group('tdstitle')
2266             else:
2267                 epTitle = mobj.group('cntitle')
2268             dlNewest = False
2269         else:
2270             dlNewest = not mobj.group('episode')
2271             if dlNewest:
2272                 epTitle = mobj.group('showname')
2273             else:
2274                 epTitle = mobj.group('episode')
2275
2276         self.report_extraction(epTitle)
2277         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2278         if dlNewest:
2279             url = htmlHandle.geturl()
2280             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2281             if mobj is None:
2282                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2283             if mobj.group('episode') == '':
2284                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2285             epTitle = mobj.group('episode')
2286
2287         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2288
2289         if len(mMovieParams) == 0:
2290             # The Colbert Report embeds the information in a without
2291             # a URL prefix; so extract the alternate reference
2292             # and then add the URL prefix manually.
2293
2294             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2295             if len(altMovieParams) == 0:
2296                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2297             else:
2298                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2299
2300         uri = mMovieParams[0][1]
2301         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2302         indexXml = self._download_webpage(indexUrl, epTitle,
2303                                           u'Downloading show index',
2304                                           u'unable to download episode index')
2305
2306         results = []
2307
2308         idoc = xml.etree.ElementTree.fromstring(indexXml)
2309         itemEls = idoc.findall('.//item')
2310         for partNum,itemEl in enumerate(itemEls):
2311             mediaId = itemEl.findall('./guid')[0].text
2312             shortMediaId = mediaId.split(':')[-1]
2313             showId = mediaId.split(':')[-2].replace('.com', '')
2314             officialTitle = itemEl.findall('./title')[0].text
2315             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2316
2317             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2318                         compat_urllib_parse.urlencode({'uri': mediaId}))
2319             configXml = self._download_webpage(configUrl, epTitle,
2320                                                u'Downloading configuration for %s' % shortMediaId)
2321
2322             cdoc = xml.etree.ElementTree.fromstring(configXml)
2323             turls = []
2324             for rendition in cdoc.findall('.//rendition'):
2325                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2326                 turls.append(finfo)
2327
2328             if len(turls) == 0:
2329                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2330                 continue
2331
2332             if self._downloader.params.get('listformats', None):
2333                 self._print_formats([i[0] for i in turls])
2334                 return
2335
2336             # For now, just pick the highest bitrate
2337             format,rtmp_video_url = turls[-1]
2338
2339             # Get the format arg from the arg stream
2340             req_format = self._downloader.params.get('format', None)
2341
2342             # Select format if we can find one
2343             for f,v in turls:
2344                 if f == req_format:
2345                     format, rtmp_video_url = f, v
2346                     break
2347
2348             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2349             if not m:
2350                 raise ExtractorError(u'Cannot transform RTMP url')
2351             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2352             video_url = base + m.group('finalid')
2353
2354             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2355             info = {
2356                 'id': shortMediaId,
2357                 'url': video_url,
2358                 'uploader': showId,
2359                 'upload_date': officialDate,
2360                 'title': effTitle,
2361                 'ext': 'mp4',
2362                 'format': format,
2363                 'thumbnail': None,
2364                 'description': officialTitle,
2365             }
2366             results.append(info)
2367
2368         return results
2369
2370
2371 class EscapistIE(InfoExtractor):
2372     """Information extractor for The Escapist """
2373
2374     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2375     IE_NAME = u'escapist'
2376
2377     def _real_extract(self, url):
2378         mobj = re.match(self._VALID_URL, url)
2379         if mobj is None:
2380             raise ExtractorError(u'Invalid URL: %s' % url)
2381         showName = mobj.group('showname')
2382         videoId = mobj.group('episode')
2383
2384         self.report_extraction(showName)
2385         webPage = self._download_webpage(url, showName)
2386
2387         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2388         description = unescapeHTML(descMatch.group(1))
2389         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2390         imgUrl = unescapeHTML(imgMatch.group(1))
2391         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2392         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2393         configUrlMatch = re.search('config=(.*)$', playerUrl)
2394         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2395
2396         configJSON = self._download_webpage(configUrl, showName,
2397                                             u'Downloading configuration',
2398                                             u'unable to download configuration')
2399
2400         # Technically, it's JavaScript, not JSON
2401         configJSON = configJSON.replace("'", '"')
2402
2403         try:
2404             config = json.loads(configJSON)
2405         except (ValueError,) as err:
2406             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2407
2408         playlist = config['playlist']
2409         videoUrl = playlist[1]['url']
2410
2411         info = {
2412             'id': videoId,
2413             'url': videoUrl,
2414             'uploader': showName,
2415             'upload_date': None,
2416             'title': showName,
2417             'ext': 'mp4',
2418             'thumbnail': imgUrl,
2419             'description': description,
2420             'player_url': playerUrl,
2421         }
2422
2423         return [info]
2424
2425 class CollegeHumorIE(InfoExtractor):
2426     """Information extractor for collegehumor.com"""
2427
2428     _WORKING = False
2429     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2430     IE_NAME = u'collegehumor'
2431
2432     def report_manifest(self, video_id):
2433         """Report information extraction."""
2434         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2435
2436     def _real_extract(self, url):
2437         mobj = re.match(self._VALID_URL, url)
2438         if mobj is None:
2439             raise ExtractorError(u'Invalid URL: %s' % url)
2440         video_id = mobj.group('videoid')
2441
2442         info = {
2443             'id': video_id,
2444             'uploader': None,
2445             'upload_date': None,
2446         }
2447
2448         self.report_extraction(video_id)
2449         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2450         try:
2451             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2452         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2453             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2454
2455         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2456         try:
2457             videoNode = mdoc.findall('./video')[0]
2458             info['description'] = videoNode.findall('./description')[0].text
2459             info['title'] = videoNode.findall('./caption')[0].text
2460             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2461             manifest_url = videoNode.findall('./file')[0].text
2462         except IndexError:
2463             raise ExtractorError(u'Invalid metadata XML file')
2464
2465         manifest_url += '?hdcore=2.10.3'
2466         self.report_manifest(video_id)
2467         try:
2468             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2469         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2470             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2471
2472         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2473         try:
2474             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2475             node_id = media_node.attrib['url']
2476             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2477         except IndexError as err:
2478             raise ExtractorError(u'Invalid manifest file')
2479
2480         url_pr = compat_urllib_parse_urlparse(manifest_url)
2481         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2482
2483         info['url'] = url
2484         info['ext'] = 'f4f'
2485         return [info]
2486
2487
2488 class XVideosIE(InfoExtractor):
2489     """Information extractor for xvideos.com"""
2490
2491     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2492     IE_NAME = u'xvideos'
2493
2494     def _real_extract(self, url):
2495         mobj = re.match(self._VALID_URL, url)
2496         if mobj is None:
2497             raise ExtractorError(u'Invalid URL: %s' % url)
2498         video_id = mobj.group(1)
2499
2500         webpage = self._download_webpage(url, video_id)
2501
2502         self.report_extraction(video_id)
2503
2504
2505         # Extract video URL
2506         mobj = re.search(r'flv_url=(.+?)&', webpage)
2507         if mobj is None:
2508             raise ExtractorError(u'Unable to extract video url')
2509         video_url = compat_urllib_parse.unquote(mobj.group(1))
2510
2511
2512         # Extract title
2513         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2514         if mobj is None:
2515             raise ExtractorError(u'Unable to extract video title')
2516         video_title = mobj.group(1)
2517
2518
2519         # Extract video thumbnail
2520         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2521         if mobj is None:
2522             raise ExtractorError(u'Unable to extract video thumbnail')
2523         video_thumbnail = mobj.group(0)
2524
2525         info = {
2526             'id': video_id,
2527             'url': video_url,
2528             'uploader': None,
2529             'upload_date': None,
2530             'title': video_title,
2531             'ext': 'flv',
2532             'thumbnail': video_thumbnail,
2533             'description': None,
2534         }
2535
2536         return [info]
2537
2538
2539 class SoundcloudIE(InfoExtractor):
2540     """Information extractor for soundcloud.com
2541        To access the media, the uid of the song and a stream token
2542        must be extracted from the page source and the script must make
2543        a request to media.soundcloud.com/crossdomain.xml. Then
2544        the media can be grabbed by requesting from an url composed
2545        of the stream token and uid
2546      """
2547
2548     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2549     IE_NAME = u'soundcloud'
2550
2551     def report_resolve(self, video_id):
2552         """Report information extraction."""
2553         self.to_screen(u'%s: Resolving id' % video_id)
2554
2555     def _real_extract(self, url):
2556         mobj = re.match(self._VALID_URL, url)
2557         if mobj is None:
2558             raise ExtractorError(u'Invalid URL: %s' % url)
2559
2560         # extract uploader (which is in the url)
2561         uploader = mobj.group(1)
2562         # extract simple title (uploader + slug of song title)
2563         slug_title =  mobj.group(2)
2564         simple_title = uploader + u'-' + slug_title
2565         full_title = '%s/%s' % (uploader, slug_title)
2566
2567         self.report_resolve(full_title)
2568
2569         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2570         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2571         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2572
2573         info = json.loads(info_json)
2574         video_id = info['id']
2575         self.report_extraction(full_title)
2576
2577         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2578         stream_json = self._download_webpage(streams_url, full_title,
2579                                              u'Downloading stream definitions',
2580                                              u'unable to download stream definitions')
2581
2582         streams = json.loads(stream_json)
2583         mediaURL = streams['http_mp3_128_url']
2584         upload_date = unified_strdate(info['created_at'])
2585
2586         return [{
2587             'id':       info['id'],
2588             'url':      mediaURL,
2589             'uploader': info['user']['username'],
2590             'upload_date': upload_date,
2591             'title':    info['title'],
2592             'ext':      u'mp3',
2593             'description': info['description'],
2594         }]
2595
2596 class SoundcloudSetIE(InfoExtractor):
2597     """Information extractor for soundcloud.com sets
2598        To access the media, the uid of the song and a stream token
2599        must be extracted from the page source and the script must make
2600        a request to media.soundcloud.com/crossdomain.xml. Then
2601        the media can be grabbed by requesting from an url composed
2602        of the stream token and uid
2603      """
2604
2605     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2606     IE_NAME = u'soundcloud:set'
2607
2608     def report_resolve(self, video_id):
2609         """Report information extraction."""
2610         self.to_screen(u'%s: Resolving id' % video_id)
2611
2612     def _real_extract(self, url):
2613         mobj = re.match(self._VALID_URL, url)
2614         if mobj is None:
2615             raise ExtractorError(u'Invalid URL: %s' % url)
2616
2617         # extract uploader (which is in the url)
2618         uploader = mobj.group(1)
2619         # extract simple title (uploader + slug of song title)
2620         slug_title =  mobj.group(2)
2621         simple_title = uploader + u'-' + slug_title
2622         full_title = '%s/sets/%s' % (uploader, slug_title)
2623
2624         self.report_resolve(full_title)
2625
2626         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2627         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2628         info_json = self._download_webpage(resolv_url, full_title)
2629
2630         videos = []
2631         info = json.loads(info_json)
2632         if 'errors' in info:
2633             for err in info['errors']:
2634                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2635             return
2636
2637         self.report_extraction(full_title)
2638         for track in info['tracks']:
2639             video_id = track['id']
2640
2641             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2642             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2643
2644             self.report_extraction(video_id)
2645             streams = json.loads(stream_json)
2646             mediaURL = streams['http_mp3_128_url']
2647
2648             videos.append({
2649                 'id':       video_id,
2650                 'url':      mediaURL,
2651                 'uploader': track['user']['username'],
2652                 'upload_date':  unified_strdate(track['created_at']),
2653                 'title':    track['title'],
2654                 'ext':      u'mp3',
2655                 'description': track['description'],
2656             })
2657         return videos
2658
2659
2660 class InfoQIE(InfoExtractor):
2661     """Information extractor for infoq.com"""
2662     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2663
2664     def _real_extract(self, url):
2665         mobj = re.match(self._VALID_URL, url)
2666         if mobj is None:
2667             raise ExtractorError(u'Invalid URL: %s' % url)
2668
2669         webpage = self._download_webpage(url, video_id=url)
2670         self.report_extraction(url)
2671
2672         # Extract video URL
2673         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2674         if mobj is None:
2675             raise ExtractorError(u'Unable to extract video url')
2676         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2677         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2678
2679         # Extract title
2680         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2681         if mobj is None:
2682             raise ExtractorError(u'Unable to extract video title')
2683         video_title = mobj.group(1)
2684
2685         # Extract description
2686         video_description = u'No description available.'
2687         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2688         if mobj is not None:
2689             video_description = mobj.group(1)
2690
2691         video_filename = video_url.split('/')[-1]
2692         video_id, extension = video_filename.split('.')
2693
2694         info = {
2695             'id': video_id,
2696             'url': video_url,
2697             'uploader': None,
2698             'upload_date': None,
2699             'title': video_title,
2700             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2701             'thumbnail': None,
2702             'description': video_description,
2703         }
2704
2705         return [info]
2706
2707 class MixcloudIE(InfoExtractor):
2708     """Information extractor for www.mixcloud.com"""
2709
2710     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2711     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2712     IE_NAME = u'mixcloud'
2713
2714     def report_download_json(self, file_id):
2715         """Report JSON download."""
2716         self.to_screen(u'Downloading json')
2717
2718     def get_urls(self, jsonData, fmt, bitrate='best'):
2719         """Get urls from 'audio_formats' section in json"""
2720         file_url = None
2721         try:
2722             bitrate_list = jsonData[fmt]
2723             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2724                 bitrate = max(bitrate_list) # select highest
2725
2726             url_list = jsonData[fmt][bitrate]
2727         except TypeError: # we have no bitrate info.
2728             url_list = jsonData[fmt]
2729         return url_list
2730
2731     def check_urls(self, url_list):
2732         """Returns 1st active url from list"""
2733         for url in url_list:
2734             try:
2735                 compat_urllib_request.urlopen(url)
2736                 return url
2737             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2738                 url = None
2739
2740         return None
2741
2742     def _print_formats(self, formats):
2743         print('Available formats:')
2744         for fmt in formats.keys():
2745             for b in formats[fmt]:
2746                 try:
2747                     ext = formats[fmt][b][0]
2748                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2749                 except TypeError: # we have no bitrate info
2750                     ext = formats[fmt][0]
2751                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2752                     break
2753
2754     def _real_extract(self, url):
2755         mobj = re.match(self._VALID_URL, url)
2756         if mobj is None:
2757             raise ExtractorError(u'Invalid URL: %s' % url)
2758         # extract uploader & filename from url
2759         uploader = mobj.group(1).decode('utf-8')
2760         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2761
2762         # construct API request
2763         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2764         # retrieve .json file with links to files
2765         request = compat_urllib_request.Request(file_url)
2766         try:
2767             self.report_download_json(file_url)
2768             jsonData = compat_urllib_request.urlopen(request).read()
2769         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2770             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2771
2772         # parse JSON
2773         json_data = json.loads(jsonData)
2774         player_url = json_data['player_swf_url']
2775         formats = dict(json_data['audio_formats'])
2776
2777         req_format = self._downloader.params.get('format', None)
2778         bitrate = None
2779
2780         if self._downloader.params.get('listformats', None):
2781             self._print_formats(formats)
2782             return
2783
2784         if req_format is None or req_format == 'best':
2785             for format_param in formats.keys():
2786                 url_list = self.get_urls(formats, format_param)
2787                 # check urls
2788                 file_url = self.check_urls(url_list)
2789                 if file_url is not None:
2790                     break # got it!
2791         else:
2792             if req_format not in formats:
2793                 raise ExtractorError(u'Format is not available')
2794
2795             url_list = self.get_urls(formats, req_format)
2796             file_url = self.check_urls(url_list)
2797             format_param = req_format
2798
2799         return [{
2800             'id': file_id.decode('utf-8'),
2801             'url': file_url.decode('utf-8'),
2802             'uploader': uploader.decode('utf-8'),
2803             'upload_date': None,
2804             'title': json_data['name'],
2805             'ext': file_url.split('.')[-1].decode('utf-8'),
2806             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2807             'thumbnail': json_data['thumbnail_url'],
2808             'description': json_data['description'],
2809             'player_url': player_url.decode('utf-8'),
2810         }]
2811
2812 class StanfordOpenClassroomIE(InfoExtractor):
2813     """Information extractor for Stanford's Open ClassRoom"""
2814
2815     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2816     IE_NAME = u'stanfordoc'
2817
2818     def _real_extract(self, url):
2819         mobj = re.match(self._VALID_URL, url)
2820         if mobj is None:
2821             raise ExtractorError(u'Invalid URL: %s' % url)
2822
2823         if mobj.group('course') and mobj.group('video'): # A specific video
2824             course = mobj.group('course')
2825             video = mobj.group('video')
2826             info = {
2827                 'id': course + '_' + video,
2828                 'uploader': None,
2829                 'upload_date': None,
2830             }
2831
2832             self.report_extraction(info['id'])
2833             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2834             xmlUrl = baseUrl + video + '.xml'
2835             try:
2836                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2837             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2838                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2839             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2840             try:
2841                 info['title'] = mdoc.findall('./title')[0].text
2842                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2843             except IndexError:
2844                 raise ExtractorError(u'Invalid metadata XML file')
2845             info['ext'] = info['url'].rpartition('.')[2]
2846             return [info]
2847         elif mobj.group('course'): # A course page
2848             course = mobj.group('course')
2849             info = {
2850                 'id': course,
2851                 'type': 'playlist',
2852                 'uploader': None,
2853                 'upload_date': None,
2854             }
2855
2856             coursepage = self._download_webpage(url, info['id'],
2857                                         note='Downloading course info page',
2858                                         errnote='Unable to download course info page')
2859
2860             m = re.search('<h1>([^<]+)</h1>', coursepage)
2861             if m:
2862                 info['title'] = unescapeHTML(m.group(1))
2863             else:
2864                 info['title'] = info['id']
2865
2866             m = re.search('<description>([^<]+)</description>', coursepage)
2867             if m:
2868                 info['description'] = unescapeHTML(m.group(1))
2869
2870             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2871             info['list'] = [
2872                 {
2873                     'type': 'reference',
2874                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2875                 }
2876                     for vpage in links]
2877             results = []
2878             for entry in info['list']:
2879                 assert entry['type'] == 'reference'
2880                 results += self.extract(entry['url'])
2881             return results
2882         else: # Root page
2883             info = {
2884                 'id': 'Stanford OpenClassroom',
2885                 'type': 'playlist',
2886                 'uploader': None,
2887                 'upload_date': None,
2888             }
2889
2890             self.report_download_webpage(info['id'])
2891             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2892             try:
2893                 rootpage = compat_urllib_request.urlopen(rootURL).read()
2894             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2895                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2896
2897             info['title'] = info['id']
2898
2899             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2900             info['list'] = [
2901                 {
2902                     'type': 'reference',
2903                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2904                 }
2905                     for cpage in links]
2906
2907             results = []
2908             for entry in info['list']:
2909                 assert entry['type'] == 'reference'
2910                 results += self.extract(entry['url'])
2911             return results
2912
2913 class MTVIE(InfoExtractor):
2914     """Information extractor for MTV.com"""
2915
2916     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2917     IE_NAME = u'mtv'
2918
2919     def _real_extract(self, url):
2920         mobj = re.match(self._VALID_URL, url)
2921         if mobj is None:
2922             raise ExtractorError(u'Invalid URL: %s' % url)
2923         if not mobj.group('proto'):
2924             url = 'http://' + url
2925         video_id = mobj.group('videoid')
2926
2927         webpage = self._download_webpage(url, video_id)
2928
2929         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2930         if mobj is None:
2931             raise ExtractorError(u'Unable to extract song name')
2932         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2933         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2934         if mobj is None:
2935             raise ExtractorError(u'Unable to extract performer')
2936         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2937         video_title = performer + ' - ' + song_name
2938
2939         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2940         if mobj is None:
2941             raise ExtractorError(u'Unable to mtvn_uri')
2942         mtvn_uri = mobj.group(1)
2943
2944         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2945         if mobj is None:
2946             raise ExtractorError(u'Unable to extract content id')
2947         content_id = mobj.group(1)
2948
2949         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2950         self.report_extraction(video_id)
2951         request = compat_urllib_request.Request(videogen_url)
2952         try:
2953             metadataXml = compat_urllib_request.urlopen(request).read()
2954         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2955             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2956
2957         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2958         renditions = mdoc.findall('.//rendition')
2959
2960         # For now, always pick the highest quality.
2961         rendition = renditions[-1]
2962
2963         try:
2964             _,_,ext = rendition.attrib['type'].partition('/')
2965             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2966             video_url = rendition.find('./src').text
2967         except KeyError:
2968             raise ExtractorError('Invalid rendition field.')
2969
2970         info = {
2971             'id': video_id,
2972             'url': video_url,
2973             'uploader': performer,
2974             'upload_date': None,
2975             'title': video_title,
2976             'ext': ext,
2977             'format': format,
2978         }
2979
2980         return [info]
2981
2982
2983 class YoukuIE(InfoExtractor):
2984     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2985
2986     def _gen_sid(self):
2987         nowTime = int(time.time() * 1000)
2988         random1 = random.randint(1000,1998)
2989         random2 = random.randint(1000,9999)
2990
2991         return "%d%d%d" %(nowTime,random1,random2)
2992
2993     def _get_file_ID_mix_string(self, seed):
2994         mixed = []
2995         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2996         seed = float(seed)
2997         for i in range(len(source)):
2998             seed  =  (seed * 211 + 30031 ) % 65536
2999             index  =  math.floor(seed / 65536 * len(source) )
3000             mixed.append(source[int(index)])
3001             source.remove(source[int(index)])
3002         #return ''.join(mixed)
3003         return mixed
3004
3005     def _get_file_id(self, fileId, seed):
3006         mixed = self._get_file_ID_mix_string(seed)
3007         ids = fileId.split('*')
3008         realId = []
3009         for ch in ids:
3010             if ch:
3011                 realId.append(mixed[int(ch)])
3012         return ''.join(realId)
3013
3014     def _real_extract(self, url):
3015         mobj = re.match(self._VALID_URL, url)
3016         if mobj is None:
3017             raise ExtractorError(u'Invalid URL: %s' % url)
3018         video_id = mobj.group('ID')
3019
3020         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3021
3022         jsondata = self._download_webpage(info_url, video_id)
3023
3024         self.report_extraction(video_id)
3025         try:
3026             config = json.loads(jsondata)
3027
3028             video_title =  config['data'][0]['title']
3029             seed = config['data'][0]['seed']
3030
3031             format = self._downloader.params.get('format', None)
3032             supported_format = list(config['data'][0]['streamfileids'].keys())
3033
3034             if format is None or format == 'best':
3035                 if 'hd2' in supported_format:
3036                     format = 'hd2'
3037                 else:
3038                     format = 'flv'
3039                 ext = u'flv'
3040             elif format == 'worst':
3041                 format = 'mp4'
3042                 ext = u'mp4'
3043             else:
3044                 format = 'flv'
3045                 ext = u'flv'
3046
3047
3048             fileid = config['data'][0]['streamfileids'][format]
3049             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3050         except (UnicodeDecodeError, ValueError, KeyError):
3051             raise ExtractorError(u'Unable to extract info section')
3052
3053         files_info=[]
3054         sid = self._gen_sid()
3055         fileid = self._get_file_id(fileid, seed)
3056
3057         #column 8,9 of fileid represent the segment number
3058         #fileid[7:9] should be changed
3059         for index, key in enumerate(keys):
3060
3061             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3062             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3063
3064             info = {
3065                 'id': '%s_part%02d' % (video_id, index),
3066                 'url': download_url,
3067                 'uploader': None,
3068                 'upload_date': None,
3069                 'title': video_title,
3070                 'ext': ext,
3071             }
3072             files_info.append(info)
3073
3074         return files_info
3075
3076
3077 class XNXXIE(InfoExtractor):
3078     """Information extractor for xnxx.com"""
3079
3080     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3081     IE_NAME = u'xnxx'
3082     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3083     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3084     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3085
3086     def _real_extract(self, url):
3087         mobj = re.match(self._VALID_URL, url)
3088         if mobj is None:
3089             raise ExtractorError(u'Invalid URL: %s' % url)
3090         video_id = mobj.group(1)
3091
3092         # Get webpage content
3093         webpage = self._download_webpage(url, video_id)
3094
3095         result = re.search(self.VIDEO_URL_RE, webpage)
3096         if result is None:
3097             raise ExtractorError(u'Unable to extract video url')
3098         video_url = compat_urllib_parse.unquote(result.group(1))
3099
3100         result = re.search(self.VIDEO_TITLE_RE, webpage)
3101         if result is None:
3102             raise ExtractorError(u'Unable to extract video title')
3103         video_title = result.group(1)
3104
3105         result = re.search(self.VIDEO_THUMB_RE, webpage)
3106         if result is None:
3107             raise ExtractorError(u'Unable to extract video thumbnail')
3108         video_thumbnail = result.group(1)
3109
3110         return [{
3111             'id': video_id,
3112             'url': video_url,
3113             'uploader': None,
3114             'upload_date': None,
3115             'title': video_title,
3116             'ext': 'flv',
3117             'thumbnail': video_thumbnail,
3118             'description': None,
3119         }]
3120
3121
3122 class GooglePlusIE(InfoExtractor):
3123     """Information extractor for plus.google.com."""
3124
3125     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3126     IE_NAME = u'plus.google'
3127
3128     def report_extract_entry(self, url):
3129         """Report downloading extry"""
3130         self.to_screen(u'Downloading entry: %s' % url)
3131
3132     def report_date(self, upload_date):
3133         """Report downloading extry"""
3134         self.to_screen(u'Entry date: %s' % upload_date)
3135
3136     def report_uploader(self, uploader):
3137         """Report downloading extry"""
3138         self.to_screen(u'Uploader: %s' % uploader)
3139
3140     def report_title(self, video_title):
3141         """Report downloading extry"""
3142         self.to_screen(u'Title: %s' % video_title)
3143
3144     def report_extract_vid_page(self, video_page):
3145         """Report information extraction."""
3146         self.to_screen(u'Extracting video page: %s' % video_page)
3147
3148     def _real_extract(self, url):
3149         # Extract id from URL
3150         mobj = re.match(self._VALID_URL, url)
3151         if mobj is None:
3152             raise ExtractorError(u'Invalid URL: %s' % url)
3153
3154         post_url = mobj.group(0)
3155         video_id = mobj.group(1)
3156
3157         video_extension = 'flv'
3158
3159         # Step 1, Retrieve post webpage to extract further information
3160         self.report_extract_entry(post_url)
3161         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3162
3163         # Extract update date
3164         upload_date = None
3165         pattern = 'title="Timestamp">(.*?)</a>'
3166         mobj = re.search(pattern, webpage)
3167         if mobj:
3168             upload_date = mobj.group(1)
3169             # Convert timestring to a format suitable for filename
3170             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3171             upload_date = upload_date.strftime('%Y%m%d')
3172         self.report_date(upload_date)
3173
3174         # Extract uploader
3175         uploader = None
3176         pattern = r'rel\="author".*?>(.*?)</a>'
3177         mobj = re.search(pattern, webpage)
3178         if mobj:
3179             uploader = mobj.group(1)
3180         self.report_uploader(uploader)
3181
3182         # Extract title
3183         # Get the first line for title
3184         video_title = u'NA'
3185         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3186         mobj = re.search(pattern, webpage)
3187         if mobj:
3188             video_title = mobj.group(1)
3189         self.report_title(video_title)
3190
3191         # Step 2, Stimulate clicking the image box to launch video
3192         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3193         mobj = re.search(pattern, webpage)
3194         if mobj is None:
3195             raise ExtractorError(u'Unable to extract video page URL')
3196
3197         video_page = mobj.group(1)
3198         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3199         self.report_extract_vid_page(video_page)
3200
3201
3202         # Extract video links on video page
3203         """Extract video links of all sizes"""
3204         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3205         mobj = re.findall(pattern, webpage)
3206         if len(mobj) == 0:
3207             raise ExtractorError(u'Unable to extract video links')
3208
3209         # Sort in resolution
3210         links = sorted(mobj)
3211
3212         # Choose the lowest of the sort, i.e. highest resolution
3213         video_url = links[-1]
3214         # Only get the url. The resolution part in the tuple has no use anymore
3215         video_url = video_url[-1]
3216         # Treat escaped \u0026 style hex
3217         try:
3218             video_url = video_url.decode("unicode_escape")
3219         except AttributeError: # Python 3
3220             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3221
3222
3223         return [{
3224             'id':       video_id,
3225             'url':      video_url,
3226             'uploader': uploader,
3227             'upload_date':  upload_date,
3228             'title':    video_title,
3229             'ext':      video_extension,
3230         }]
3231
3232 class NBAIE(InfoExtractor):
3233     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3234     IE_NAME = u'nba'
3235
3236     def _real_extract(self, url):
3237         mobj = re.match(self._VALID_URL, url)
3238         if mobj is None:
3239             raise ExtractorError(u'Invalid URL: %s' % url)
3240
3241         video_id = mobj.group(1)
3242         if video_id.endswith('/index.html'):
3243             video_id = video_id[:-len('/index.html')]
3244
3245         webpage = self._download_webpage(url, video_id)
3246
3247         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3248         def _findProp(rexp, default=None):
3249             m = re.search(rexp, webpage)
3250             if m:
3251                 return unescapeHTML(m.group(1))
3252             else:
3253                 return default
3254
3255         shortened_video_id = video_id.rpartition('/')[2]
3256         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3257         info = {
3258             'id': shortened_video_id,
3259             'url': video_url,
3260             'ext': 'mp4',
3261             'title': title,
3262             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3263             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3264         }
3265         return [info]
3266
3267 class JustinTVIE(InfoExtractor):
3268     """Information extractor for justin.tv and twitch.tv"""
3269     # TODO: One broadcast may be split into multiple videos. The key
3270     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3271     # starts at 1 and increases. Can we treat all parts as one video?
3272
3273     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3274         (?:
3275             (?P<channelid>[^/]+)|
3276             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3277             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3278         )
3279         /?(?:\#.*)?$
3280         """
3281     _JUSTIN_PAGE_LIMIT = 100
3282     IE_NAME = u'justin.tv'
3283
3284     def report_download_page(self, channel, offset):
3285         """Report attempt to download a single page of videos."""
3286         self.to_screen(u'%s: Downloading video information from %d to %d' %
3287                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3288
3289     # Return count of items, list of *valid* items
3290     def _parse_page(self, url, video_id):
3291         webpage = self._download_webpage(url, video_id,
3292                                          u'Downloading video info JSON',
3293                                          u'unable to download video info JSON')
3294
3295         response = json.loads(webpage)
3296         if type(response) != list:
3297             error_text = response.get('error', 'unknown error')
3298             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3299         info = []
3300         for clip in response:
3301             video_url = clip['video_file_url']
3302             if video_url:
3303                 video_extension = os.path.splitext(video_url)[1][1:]
3304                 video_date = re.sub('-', '', clip['start_time'][:10])
3305                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3306                 video_id = clip['id']
3307                 video_title = clip.get('title', video_id)
3308                 info.append({
3309                     'id': video_id,
3310                     'url': video_url,
3311                     'title': video_title,
3312                     'uploader': clip.get('channel_name', video_uploader_id),
3313                     'uploader_id': video_uploader_id,
3314                     'upload_date': video_date,
3315                     'ext': video_extension,
3316                 })
3317         return (len(response), info)
3318
3319     def _real_extract(self, url):
3320         mobj = re.match(self._VALID_URL, url)
3321         if mobj is None:
3322             raise ExtractorError(u'invalid URL: %s' % url)
3323
3324         api_base = 'http://api.justin.tv'
3325         paged = False
3326         if mobj.group('channelid'):
3327             paged = True
3328             video_id = mobj.group('channelid')
3329             api = api_base + '/channel/archives/%s.json' % video_id
3330         elif mobj.group('chapterid'):
3331             chapter_id = mobj.group('chapterid')
3332
3333             webpage = self._download_webpage(url, chapter_id)
3334             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3335             if not m:
3336                 raise ExtractorError(u'Cannot find archive of a chapter')
3337             archive_id = m.group(1)
3338
3339             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3340             chapter_info_xml = self._download_webpage(api, chapter_id,
3341                                              note=u'Downloading chapter information',
3342                                              errnote=u'Chapter information download failed')
3343             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3344             for a in doc.findall('.//archive'):
3345                 if archive_id == a.find('./id').text:
3346                     break
3347             else:
3348                 raise ExtractorError(u'Could not find chapter in chapter information')
3349
3350             video_url = a.find('./video_file_url').text
3351             video_ext = video_url.rpartition('.')[2] or u'flv'
3352
3353             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3354             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3355                                    note='Downloading chapter metadata',
3356                                    errnote='Download of chapter metadata failed')
3357             chapter_info = json.loads(chapter_info_json)
3358
3359             bracket_start = int(doc.find('.//bracket_start').text)
3360             bracket_end = int(doc.find('.//bracket_end').text)
3361
3362             # TODO determine start (and probably fix up file)
3363             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3364             #video_url += u'?start=' + TODO:start_timestamp
3365             # bracket_start is 13290, but we want 51670615
3366             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3367                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3368
3369             info = {
3370                 'id': u'c' + chapter_id,
3371                 'url': video_url,
3372                 'ext': video_ext,
3373                 'title': chapter_info['title'],
3374                 'thumbnail': chapter_info['preview'],
3375                 'description': chapter_info['description'],
3376                 'uploader': chapter_info['channel']['display_name'],
3377                 'uploader_id': chapter_info['channel']['name'],
3378             }
3379             return [info]
3380         else:
3381             video_id = mobj.group('videoid')
3382             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3383
3384         self.report_extraction(video_id)
3385
3386         info = []
3387         offset = 0
3388         limit = self._JUSTIN_PAGE_LIMIT
3389         while True:
3390             if paged:
3391                 self.report_download_page(video_id, offset)
3392             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3393             page_count, page_info = self._parse_page(page_url, video_id)
3394             info.extend(page_info)
3395             if not paged or page_count != limit:
3396                 break
3397             offset += limit
3398         return info
3399
3400 class FunnyOrDieIE(InfoExtractor):
3401     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3402
3403     def _real_extract(self, url):
3404         mobj = re.match(self._VALID_URL, url)
3405         if mobj is None:
3406             raise ExtractorError(u'invalid URL: %s' % url)
3407
3408         video_id = mobj.group('id')
3409         webpage = self._download_webpage(url, video_id)
3410
3411         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3412         if not m:
3413             raise ExtractorError(u'Unable to find video information')
3414         video_url = unescapeHTML(m.group('url'))
3415
3416         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3417         if not m:
3418             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3419             if not m:
3420                 raise ExtractorError(u'Cannot find video title')
3421         title = clean_html(m.group('title'))
3422
3423         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3424         if m:
3425             desc = unescapeHTML(m.group('desc'))
3426         else:
3427             desc = None
3428
3429         info = {
3430             'id': video_id,
3431             'url': video_url,
3432             'ext': 'mp4',
3433             'title': title,
3434             'description': desc,
3435         }
3436         return [info]
3437
3438 class SteamIE(InfoExtractor):
3439     _VALID_URL = r"""http://store\.steampowered\.com/
3440                 (agecheck/)?
3441                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3442                 (?P<gameID>\d+)/?
3443                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3444                 """
3445
3446     @classmethod
3447     def suitable(cls, url):
3448         """Receives a URL and returns True if suitable for this IE."""
3449         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3450
3451     def _real_extract(self, url):
3452         m = re.match(self._VALID_URL, url, re.VERBOSE)
3453         gameID = m.group('gameID')
3454         videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3455         self.report_age_confirmation()
3456         webpage = self._download_webpage(videourl, gameID)
3457         game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3458
3459         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3460         mweb = re.finditer(urlRE, webpage)
3461         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3462         titles = re.finditer(namesRE, webpage)
3463         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3464         thumbs = re.finditer(thumbsRE, webpage)
3465         videos = []
3466         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3467             video_id = vid.group('videoID')
3468             title = vtitle.group('videoName')
3469             video_url = vid.group('videoURL')
3470             video_thumb = thumb.group('thumbnail')
3471             if not video_url:
3472                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3473             info = {
3474                 'id':video_id,
3475                 'url':video_url,
3476                 'ext': 'flv',
3477                 'title': unescapeHTML(title),
3478                 'thumbnail': video_thumb
3479                   }
3480             videos.append(info)
3481         return [self.playlist_result(videos, gameID, game_title)]
3482
3483 class UstreamIE(InfoExtractor):
3484     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3485     IE_NAME = u'ustream'
3486
3487     def _real_extract(self, url):
3488         m = re.match(self._VALID_URL, url)
3489         video_id = m.group('videoID')
3490         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3491         webpage = self._download_webpage(url, video_id)
3492         self.report_extraction(video_id)
3493         try:
3494             m = re.search(r'data-title="(?P<title>.+)"',webpage)
3495             title = m.group('title')
3496             m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3497                           webpage, re.DOTALL)
3498             uploader = unescapeHTML(m.group('uploader').strip())
3499             m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
3500             thumb = m.group('thumb')
3501         except AttributeError:
3502             raise ExtractorError(u'Unable to extract info')
3503         info = {
3504                 'id':video_id,
3505                 'url':video_url,
3506                 'ext': 'flv',
3507                 'title': title,
3508                 'uploader': uploader,
3509                 'thumbnail': thumb,
3510                   }
3511         return info
3512
3513 class WorldStarHipHopIE(InfoExtractor):
3514     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3515     IE_NAME = u'WorldStarHipHop'
3516
3517     def _real_extract(self, url):
3518         _src_url = r'so\.addVariable\("file","(.*?)"\)'
3519
3520         m = re.match(self._VALID_URL, url)
3521         video_id = m.group('id')
3522
3523         webpage_src = self._download_webpage(url, video_id)
3524
3525         mobj = re.search(_src_url, webpage_src)
3526
3527         if mobj is not None:
3528             video_url = mobj.group(1)
3529             if 'mp4' in video_url:
3530                 ext = 'mp4'
3531             else:
3532                 ext = 'flv'
3533         else:
3534             raise ExtractorError(u'Cannot find video url for %s' % video_id)
3535
3536         mobj = re.search(r"<title>(.*)</title>", webpage_src)
3537
3538         if mobj is None:
3539             raise ExtractorError(u'Cannot determine title')
3540         title = mobj.group(1)
3541
3542         mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3543         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3544         if mobj is not None:
3545             thumbnail = mobj.group(1)
3546         else:
3547             _title = r"""candytitles.*>(.*)</span>"""
3548             mobj = re.search(_title, webpage_src)
3549             if mobj is not None:
3550                 title = mobj.group(1)
3551             thumbnail = None
3552
3553         results = [{
3554                     'id': video_id,
3555                     'url' : video_url,
3556                     'title' : title,
3557                     'thumbnail' : thumbnail,
3558                     'ext' : ext,
3559                     }]
3560         return results
3561
3562 class RBMARadioIE(InfoExtractor):
3563     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3564
3565     def _real_extract(self, url):
3566         m = re.match(self._VALID_URL, url)
3567         video_id = m.group('videoID')
3568
3569         webpage = self._download_webpage(url, video_id)
3570         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3571         if not m:
3572             raise ExtractorError(u'Cannot find metadata')
3573         json_data = m.group(1)
3574
3575         try:
3576             data = json.loads(json_data)
3577         except ValueError as e:
3578             raise ExtractorError(u'Invalid JSON: ' + str(e))
3579
3580         video_url = data['akamai_url'] + '&cbr=256'
3581         url_parts = compat_urllib_parse_urlparse(video_url)
3582         video_ext = url_parts.path.rpartition('.')[2]
3583         info = {
3584                 'id': video_id,
3585                 'url': video_url,
3586                 'ext': video_ext,
3587                 'title': data['title'],
3588                 'description': data.get('teaser_text'),
3589                 'location': data.get('country_of_origin'),
3590                 'uploader': data.get('host', {}).get('name'),
3591                 'uploader_id': data.get('host', {}).get('slug'),
3592                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3593                 'duration': data.get('duration'),
3594         }
3595         return [info]
3596
3597
3598 class YouPornIE(InfoExtractor):
3599     """Information extractor for youporn.com."""
3600     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3601
3602     def _print_formats(self, formats):
3603         """Print all available formats"""
3604         print(u'Available formats:')
3605         print(u'ext\t\tformat')
3606         print(u'---------------------------------')
3607         for format in formats:
3608             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3609
3610     def _specific(self, req_format, formats):
3611         for x in formats:
3612             if(x["format"]==req_format):
3613                 return x
3614         return None
3615
3616     def _real_extract(self, url):
3617         mobj = re.match(self._VALID_URL, url)
3618         if mobj is None:
3619             raise ExtractorError(u'Invalid URL: %s' % url)
3620
3621         video_id = mobj.group('videoid')
3622
3623         req = compat_urllib_request.Request(url)
3624         req.add_header('Cookie', 'age_verified=1')
3625         webpage = self._download_webpage(req, video_id)
3626
3627         # Get the video title
3628         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3629         if result is None:
3630             raise ExtractorError(u'Unable to extract video title')
3631         video_title = result.group('title').strip()
3632
3633         # Get the video date
3634         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3635         if result is None:
3636             self._downloader.report_warning(u'unable to extract video date')
3637             upload_date = None
3638         else:
3639             upload_date = unified_strdate(result.group('date').strip())
3640
3641         # Get the video uploader
3642         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3643         if result is None:
3644             self._downloader.report_warning(u'unable to extract uploader')
3645             video_uploader = None
3646         else:
3647             video_uploader = result.group('uploader').strip()
3648             video_uploader = clean_html( video_uploader )
3649
3650         # Get all of the formats available
3651         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3652         result = re.search(DOWNLOAD_LIST_RE, webpage)
3653         if result is None:
3654             raise ExtractorError(u'Unable to extract download list')
3655         download_list_html = result.group('download_list').strip()
3656
3657         # Get all of the links from the page
3658         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3659         links = re.findall(LINK_RE, download_list_html)
3660         if(len(links) == 0):
3661             raise ExtractorError(u'ERROR: no known formats available for video')
3662
3663         self.to_screen(u'Links found: %d' % len(links))
3664
3665         formats = []
3666         for link in links:
3667
3668             # A link looks like this:
3669             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3670             # A path looks like this:
3671             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3672             video_url = unescapeHTML( link )
3673             path = compat_urllib_parse_urlparse( video_url ).path
3674             extension = os.path.splitext( path )[1][1:]
3675             format = path.split('/')[4].split('_')[:2]
3676             size = format[0]
3677             bitrate = format[1]
3678             format = "-".join( format )
3679             title = u'%s-%s-%s' % (video_title, size, bitrate)
3680
3681             formats.append({
3682                 'id': video_id,
3683                 'url': video_url,
3684                 'uploader': video_uploader,
3685                 'upload_date': upload_date,
3686                 'title': title,
3687                 'ext': extension,
3688                 'format': format,
3689                 'thumbnail': None,
3690                 'description': None,
3691                 'player_url': None
3692             })
3693
3694         if self._downloader.params.get('listformats', None):
3695             self._print_formats(formats)
3696             return
3697
3698         req_format = self._downloader.params.get('format', None)
3699         self.to_screen(u'Format: %s' % req_format)
3700
3701         if req_format is None or req_format == 'best':
3702             return [formats[0]]
3703         elif req_format == 'worst':
3704             return [formats[-1]]
3705         elif req_format in ('-1', 'all'):
3706             return formats
3707         else:
3708             format = self._specific( req_format, formats )
3709             if result is None:
3710                 raise ExtractorError(u'Requested format not available')
3711             return [format]
3712
3713
3714
3715 class PornotubeIE(InfoExtractor):
3716     """Information extractor for pornotube.com."""
3717     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3718
3719     def _real_extract(self, url):
3720         mobj = re.match(self._VALID_URL, url)
3721         if mobj is None:
3722             raise ExtractorError(u'Invalid URL: %s' % url)
3723
3724         video_id = mobj.group('videoid')
3725         video_title = mobj.group('title')
3726
3727         # Get webpage content
3728         webpage = self._download_webpage(url, video_id)
3729
3730         # Get the video URL
3731         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3732         result = re.search(VIDEO_URL_RE, webpage)
3733         if result is None:
3734             raise ExtractorError(u'Unable to extract video url')
3735         video_url = compat_urllib_parse.unquote(result.group('url'))
3736
3737         #Get the uploaded date
3738         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3739         result = re.search(VIDEO_UPLOADED_RE, webpage)
3740         if result is None:
3741             raise ExtractorError(u'Unable to extract video title')
3742         upload_date = unified_strdate(result.group('date'))
3743
3744         info = {'id': video_id,
3745                 'url': video_url,
3746                 'uploader': None,
3747                 'upload_date': upload_date,
3748                 'title': video_title,
3749                 'ext': 'flv',
3750                 'format': 'flv'}
3751
3752         return [info]
3753
3754 class YouJizzIE(InfoExtractor):
3755     """Information extractor for youjizz.com."""
3756     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3757
3758     def _real_extract(self, url):
3759         mobj = re.match(self._VALID_URL, url)
3760         if mobj is None:
3761             raise ExtractorError(u'Invalid URL: %s' % url)
3762
3763         video_id = mobj.group('videoid')
3764
3765         # Get webpage content
3766         webpage = self._download_webpage(url, video_id)
3767
3768         # Get the video title
3769         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3770         if result is None:
3771             raise ExtractorError(u'ERROR: unable to extract video title')
3772         video_title = result.group('title').strip()
3773
3774         # Get the embed page
3775         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3776         if result is None:
3777             raise ExtractorError(u'ERROR: unable to extract embed page')
3778
3779         embed_page_url = result.group(0).strip()
3780         video_id = result.group('videoid')
3781
3782         webpage = self._download_webpage(embed_page_url, video_id)
3783
3784         # Get the video URL
3785         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3786         if result is None:
3787             raise ExtractorError(u'ERROR: unable to extract video url')
3788         video_url = result.group('source')
3789
3790         info = {'id': video_id,
3791                 'url': video_url,
3792                 'title': video_title,
3793                 'ext': 'flv',
3794                 'format': 'flv',
3795                 'player_url': embed_page_url}
3796
3797         return [info]
3798
3799 class EightTracksIE(InfoExtractor):
3800     IE_NAME = '8tracks'
3801     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3802
3803     def _real_extract(self, url):
3804         mobj = re.match(self._VALID_URL, url)
3805         if mobj is None:
3806             raise ExtractorError(u'Invalid URL: %s' % url)
3807         playlist_id = mobj.group('id')
3808
3809         webpage = self._download_webpage(url, playlist_id)
3810
3811         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3812         if not m:
3813             raise ExtractorError(u'Cannot find trax information')
3814         json_like = m.group(1)
3815         data = json.loads(json_like)
3816
3817         session = str(random.randint(0, 1000000000))
3818         mix_id = data['id']
3819         track_count = data['tracks_count']
3820         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3821         next_url = first_url
3822         res = []
3823         for i in itertools.count():
3824             api_json = self._download_webpage(next_url, playlist_id,
3825                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3826                 errnote=u'Failed to download song information')
3827             api_data = json.loads(api_json)
3828             track_data = api_data[u'set']['track']
3829             info = {
3830                 'id': track_data['id'],
3831                 'url': track_data['track_file_stream_url'],
3832                 'title': track_data['performer'] + u' - ' + track_data['name'],
3833                 'raw_title': track_data['name'],
3834                 'uploader_id': data['user']['login'],
3835                 'ext': 'm4a',
3836             }
3837             res.append(info)
3838             if api_data['set']['at_last_track']:
3839                 break
3840             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3841         return res
3842
3843 class KeekIE(InfoExtractor):
3844     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3845     IE_NAME = u'keek'
3846
3847     def _real_extract(self, url):
3848         m = re.match(self._VALID_URL, url)
3849         video_id = m.group('videoID')
3850         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3851         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3852         webpage = self._download_webpage(url, video_id)
3853         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3854         title = unescapeHTML(m.group('title'))
3855         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3856         uploader = clean_html(m.group('uploader'))
3857         info = {
3858                 'id': video_id,
3859                 'url': video_url,
3860                 'ext': 'mp4',
3861                 'title': title,
3862                 'thumbnail': thumbnail,
3863                 'uploader': uploader
3864         }
3865         return [info]
3866
3867 class TEDIE(InfoExtractor):
3868     _VALID_URL=r'''http://www\.ted\.com/
3869                    (
3870                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3871                         |
3872                         ((?P<type_talk>talks)) # We have a simple talk
3873                    )
3874                    (/lang/(.*?))? # The url may contain the language
3875                    /(?P<name>\w+) # Here goes the name and then ".html"
3876                    '''
3877
3878     @classmethod
3879     def suitable(cls, url):
3880         """Receives a URL and returns True if suitable for this IE."""
3881         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3882
3883     def _real_extract(self, url):
3884         m=re.match(self._VALID_URL, url, re.VERBOSE)
3885         if m.group('type_talk'):
3886             return [self._talk_info(url)]
3887         else :
3888             playlist_id=m.group('playlist_id')
3889             name=m.group('name')
3890             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3891             return [self._playlist_videos_info(url,name,playlist_id)]
3892
3893     def _talk_video_link(self,mediaSlug):
3894         '''Returns the video link for that mediaSlug'''
3895         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3896
3897     def _playlist_videos_info(self,url,name,playlist_id=0):
3898         '''Returns the videos of the playlist'''
3899         video_RE=r'''
3900                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3901                      ([.\s]*?)data-playlist_item_id="(\d+)"
3902                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3903                      '''
3904         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3905         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3906         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3907         m_names=re.finditer(video_name_RE,webpage)
3908
3909         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3910         m_playlist = re.search(playlist_RE, webpage)
3911         playlist_title = m_playlist.group('playlist_title')
3912
3913         playlist_entries = []
3914         for m_video, m_name in zip(m_videos,m_names):
3915             video_id=m_video.group('video_id')
3916             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3917             playlist_entries.append(self.url_result(talk_url, 'TED'))
3918         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3919
3920     def _talk_info(self, url, video_id=0):
3921         """Return the video for the talk in the url"""
3922         m=re.match(self._VALID_URL, url,re.VERBOSE)
3923         videoName=m.group('name')
3924         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3925         # If the url includes the language we get the title translated
3926         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3927         title=re.search(title_RE, webpage).group('title')
3928         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3929                         "id":(?P<videoID>[\d]+).*?
3930                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3931         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3932         thumb_match=re.search(thumb_RE,webpage)
3933         info_match=re.search(info_RE,webpage,re.VERBOSE)
3934         video_id=info_match.group('videoID')
3935         mediaSlug=info_match.group('mediaSlug')
3936         video_url=self._talk_video_link(mediaSlug)
3937         info = {
3938                 'id': video_id,
3939                 'url': video_url,
3940                 'ext': 'mp4',
3941                 'title': title,
3942                 'thumbnail': thumb_match.group('thumbnail')
3943                 }
3944         return info
3945
3946 class MySpassIE(InfoExtractor):
3947     _VALID_URL = r'http://www.myspass.de/.*'
3948
3949     def _real_extract(self, url):
3950         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3951
3952         # video id is the last path element of the URL
3953         # usually there is a trailing slash, so also try the second but last
3954         url_path = compat_urllib_parse_urlparse(url).path
3955         url_parent_path, video_id = os.path.split(url_path)
3956         if not video_id:
3957             _, video_id = os.path.split(url_parent_path)
3958
3959         # get metadata
3960         metadata_url = META_DATA_URL_TEMPLATE % video_id
3961         metadata_text = self._download_webpage(metadata_url, video_id)
3962         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3963
3964         # extract values from metadata
3965         url_flv_el = metadata.find('url_flv')
3966         if url_flv_el is None:
3967             raise ExtractorError(u'Unable to extract download url')
3968         video_url = url_flv_el.text
3969         extension = os.path.splitext(video_url)[1][1:]
3970         title_el = metadata.find('title')
3971         if title_el is None:
3972             raise ExtractorError(u'Unable to extract title')
3973         title = title_el.text
3974         format_id_el = metadata.find('format_id')
3975         if format_id_el is None:
3976             format = ext
3977         else:
3978             format = format_id_el.text
3979         description_el = metadata.find('description')
3980         if description_el is not None:
3981             description = description_el.text
3982         else:
3983             description = None
3984         imagePreview_el = metadata.find('imagePreview')
3985         if imagePreview_el is not None:
3986             thumbnail = imagePreview_el.text
3987         else:
3988             thumbnail = None
3989         info = {
3990             'id': video_id,
3991             'url': video_url,
3992             'title': title,
3993             'ext': extension,
3994             'format': format,
3995             'thumbnail': thumbnail,
3996             'description': description
3997         }
3998         return [info]
3999
4000 class SpiegelIE(InfoExtractor):
4001     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4002
4003     def _real_extract(self, url):
4004         m = re.match(self._VALID_URL, url)
4005         video_id = m.group('videoID')
4006
4007         webpage = self._download_webpage(url, video_id)
4008         m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
4009         if not m:
4010             raise ExtractorError(u'Cannot find title')
4011         video_title = unescapeHTML(m.group(1))
4012
4013         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4014         xml_code = self._download_webpage(xml_url, video_id,
4015                     note=u'Downloading XML', errnote=u'Failed to download XML')
4016
4017         idoc = xml.etree.ElementTree.fromstring(xml_code)
4018         last_type = idoc[-1]
4019         filename = last_type.findall('./filename')[0].text
4020         duration = float(last_type.findall('./duration')[0].text)
4021
4022         video_url = 'http://video2.spiegel.de/flash/' + filename
4023         video_ext = filename.rpartition('.')[2]
4024         info = {
4025             'id': video_id,
4026             'url': video_url,
4027             'ext': video_ext,
4028             'title': video_title,
4029             'duration': duration,
4030         }
4031         return [info]
4032
4033 class LiveLeakIE(InfoExtractor):
4034
4035     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4036     IE_NAME = u'liveleak'
4037
4038     def _real_extract(self, url):
4039         mobj = re.match(self._VALID_URL, url)
4040         if mobj is None:
4041             raise ExtractorError(u'Invalid URL: %s' % url)
4042
4043         video_id = mobj.group('video_id')
4044
4045         webpage = self._download_webpage(url, video_id)
4046
4047         m = re.search(r'file: "(.*?)",', webpage)
4048         if not m:
4049             raise ExtractorError(u'Unable to find video url')
4050         video_url = m.group(1)
4051
4052         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4053         if not m:
4054             raise ExtractorError(u'Cannot find video title')
4055         title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4056
4057         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4058         if m:
4059             desc = unescapeHTML(m.group('desc'))
4060         else:
4061             desc = None
4062
4063         m = re.search(r'By:.*?(\w+)</a>', webpage)
4064         if m:
4065             uploader = clean_html(m.group(1))
4066         else:
4067             uploader = None
4068
4069         info = {
4070             'id':  video_id,
4071             'url': video_url,
4072             'ext': 'mp4',
4073             'title': title,
4074             'description': desc,
4075             'uploader': uploader
4076         }
4077
4078         return [info]
4079
4080 class ARDIE(InfoExtractor):
4081     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4082     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4083     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4084
4085     def _real_extract(self, url):
4086         # determine video id from url
4087         m = re.match(self._VALID_URL, url)
4088
4089         numid = re.search(r'documentId=([0-9]+)', url)
4090         if numid:
4091             video_id = numid.group(1)
4092         else:
4093             video_id = m.group('video_id')
4094
4095         # determine title and media streams from webpage
4096         html = self._download_webpage(url, video_id)
4097         title = re.search(self._TITLE, html).group('title')
4098         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4099         if not streams:
4100             assert '"fsk"' in html
4101             raise ExtractorError(u'This video is only available after 8:00 pm')
4102
4103         # choose default media type and highest quality for now
4104         stream = max([s for s in streams if int(s["media_type"]) == 0],
4105                      key=lambda s: int(s["quality"]))
4106
4107         # there's two possibilities: RTMP stream or HTTP download
4108         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4109         if stream['rtmp_url']:
4110             self.to_screen(u'RTMP download detected')
4111             assert stream['video_url'].startswith('mp4:')
4112             info["url"] = stream["rtmp_url"]
4113             info["play_path"] = stream['video_url']
4114         else:
4115             assert stream["video_url"].endswith('.mp4')
4116             info["url"] = stream["video_url"]
4117         return [info]
4118
4119 class ZDFIE(InfoExtractor):
4120     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4121     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
4122     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
4123     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
4124     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
4125
4126     def _real_extract(self, url):
4127         mobj = re.match(self._VALID_URL, url)
4128         if mobj is None:
4129             raise ExtractorError(u'Invalid URL: %s' % url)
4130         video_id = mobj.group('video_id')
4131
4132         html = self._download_webpage(url, video_id)
4133         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4134         if streams is None:
4135             raise ExtractorError(u'No media url found.')
4136
4137         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
4138         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
4139         # choose first/default media type and highest quality for now
4140         for s in streams:        #find 300 - dsl1000mbit
4141             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
4142                 stream_=s
4143                 break
4144         for s in streams:        #find veryhigh - dsl2000mbit
4145             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
4146                 stream_=s
4147                 break
4148         if stream_ is None:
4149             raise ExtractorError(u'No stream found.')
4150
4151         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
4152
4153         self.report_extraction(video_id)
4154         mobj = re.search(self._TITLE, html)
4155         if mobj is None:
4156             raise ExtractorError(u'Cannot extract title')
4157         title = unescapeHTML(mobj.group('title'))
4158
4159         mobj = re.search(self._MMS_STREAM, media_link)
4160         if mobj is None:
4161             mobj = re.search(self._RTSP_STREAM, media_link)
4162             if mobj is None:
4163                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
4164         mms_url = mobj.group('video_url')
4165
4166         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
4167         if mobj is None:
4168             raise ExtractorError(u'Cannot extract extention')
4169         ext = mobj.group('ext')
4170
4171         return [{'id': video_id,
4172                  'url': mms_url,
4173                  'title': title,
4174                  'ext': ext
4175                  }]
4176
4177 class TumblrIE(InfoExtractor):
4178     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4179
4180     def _real_extract(self, url):
4181         m_url = re.match(self._VALID_URL, url)
4182         video_id = m_url.group('id')
4183         blog = m_url.group('blog_name')
4184
4185         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4186         webpage = self._download_webpage(url, video_id)
4187
4188         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4189         video = re.search(re_video, webpage)
4190         if video is None:
4191             self.to_screen("No video found")
4192             return []
4193         video_url = video.group('video_url')
4194         ext = video.group('ext')
4195
4196         re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
4197         thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4198
4199         # The only place where you can get a title, it's not complete,
4200         # but searching in other places doesn't work for all videos
4201         re_title = r'<title>(?P<title>.*?)</title>'
4202         title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4203
4204         return [{'id': video_id,
4205                  'url': video_url,
4206                  'title': title,
4207                  'thumbnail': thumb,
4208                  'ext': ext
4209                  }]
4210
4211 class BandcampIE(InfoExtractor):
4212     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4213
4214     def _real_extract(self, url):
4215         mobj = re.match(self._VALID_URL, url)
4216         title = mobj.group('title')
4217         webpage = self._download_webpage(url, title)
4218         # We get the link to the free download page
4219         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4220         if m_download is None:
4221             raise ExtractorError(u'No free songs founded')
4222
4223         download_link = m_download.group(1)
4224         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4225                        webpage, re.MULTILINE|re.DOTALL).group('id')
4226
4227         download_webpage = self._download_webpage(download_link, id,
4228                                                   'Downloading free downloads page')
4229         # We get the dictionary of the track from some javascrip code
4230         info = re.search(r'items: (.*?),$',
4231                          download_webpage, re.MULTILINE).group(1)
4232         info = json.loads(info)[0]
4233         # We pick mp3-320 for now, until format selection can be easily implemented.
4234         mp3_info = info[u'downloads'][u'mp3-320']
4235         # If we try to use this url it says the link has expired
4236         initial_url = mp3_info[u'url']
4237         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4238         m_url = re.match(re_url, initial_url)
4239         #We build the url we will use to get the final track url
4240         # This url is build in Bandcamp in the script download_bunde_*.js
4241         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4242         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4243         # If we could correctly generate the .rand field the url would be
4244         #in the "download_url" key
4245         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4246
4247         track_info = {'id':id,
4248                       'title' : info[u'title'],
4249                       'ext' : 'mp3',
4250                       'url' : final_url,
4251                       'thumbnail' : info[u'thumb_url'],
4252                       'uploader' : info[u'artist']
4253                       }
4254
4255         return [track_info]
4256
4257 class RedTubeIE(InfoExtractor):
4258     """Information Extractor for redtube"""
4259     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4260
4261     def _real_extract(self,url):
4262         mobj = re.match(self._VALID_URL, url)
4263         if mobj is None:
4264             raise ExtractorError(u'Invalid URL: %s' % url)
4265
4266         video_id = mobj.group('id')
4267         video_extension = 'mp4'
4268         webpage = self._download_webpage(url, video_id)
4269         self.report_extraction(video_id)
4270         mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4271
4272         if mobj is None:
4273             raise ExtractorError(u'Unable to extract media URL')
4274
4275         video_url = mobj.group(1)
4276         mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4277         if mobj is None:
4278             raise ExtractorError(u'Unable to extract title')
4279         video_title = mobj.group(1)
4280
4281         return [{
4282             'id':       video_id,
4283             'url':      video_url,
4284             'ext':      video_extension,
4285             'title':    video_title,
4286         }]
4287
4288 class InaIE(InfoExtractor):
4289     """Information Extractor for Ina.fr"""
4290     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4291
4292     def _real_extract(self,url):
4293         mobj = re.match(self._VALID_URL, url)
4294
4295         video_id = mobj.group('id')
4296         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4297         video_extension = 'mp4'
4298         webpage = self._download_webpage(mrss_url, video_id)
4299
4300         mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4301         if mobj is None:
4302             raise ExtractorError(u'Unable to extract media URL')
4303         video_url = mobj.group(1)
4304
4305         mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4306         if mobj is None:
4307             raise ExtractorError(u'Unable to extract title')
4308         video_title = mobj.group(1)
4309
4310         return [{
4311             'id':       video_id,
4312             'url':      video_url,
4313             'ext':      video_extension,
4314             'title':    video_title,
4315         }]
4316
4317 class HowcastIE(InfoExtractor):
4318     """Information Extractor for Howcast.com"""
4319     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4320
4321     def _real_extract(self, url):
4322         mobj = re.match(self._VALID_URL, url)
4323
4324         video_id = mobj.group('id')
4325         webpage_url = 'http://www.howcast.com/videos/' + video_id
4326         webpage = self._download_webpage(webpage_url, video_id)
4327
4328         self.report_extraction(video_id)
4329
4330         mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
4331         if mobj is None:
4332             raise ExtractorError(u'Unable to extract video URL')
4333         video_url = mobj.group(1)
4334
4335         mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
4336         if mobj is None:
4337             raise ExtractorError(u'Unable to extract title')
4338         video_title = mobj.group(1) or mobj.group(2)
4339
4340         mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
4341         if mobj is None:
4342             self._downloader.report_warning(u'unable to extract description')
4343             video_description = None
4344         else:
4345             video_description = mobj.group(1) or mobj.group(2)
4346
4347         mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
4348         if mobj is None:
4349             raise ExtractorError(u'Unable to extract thumbnail')
4350         thumbnail = mobj.group(1)
4351
4352         return [{
4353             'id':       video_id,
4354             'url':      video_url,
4355             'ext':      'mp4',
4356             'title':    video_title,
4357             'description': video_description,
4358             'thumbnail': thumbnail,
4359         }]
4360
4361 class VineIE(InfoExtractor):
4362     """Information Extractor for Vine.co"""
4363     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
4364
4365     def _real_extract(self, url):
4366
4367         mobj = re.match(self._VALID_URL, url)
4368
4369         video_id = mobj.group('id')
4370         webpage_url = 'https://vine.co/v/' + video_id
4371         webpage = self._download_webpage(webpage_url, video_id)
4372
4373         self.report_extraction(video_id)
4374
4375         mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
4376         if mobj is None:
4377             raise ExtractorError(u'Unable to extract video URL')
4378         video_url = mobj.group(1)
4379
4380         mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4381         if mobj is None:
4382             raise ExtractorError(u'Unable to extract title')
4383         video_title = mobj.group(1)
4384
4385         mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
4386         if mobj is None:
4387             raise ExtractorError(u'Unable to extract thumbnail')
4388         thumbnail = mobj.group(1)
4389
4390         mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
4391         if mobj is None:
4392             raise ExtractorError(u'Unable to extract uploader')
4393         uploader = mobj.group(1)
4394
4395         return [{
4396             'id':        video_id,
4397             'url':       video_url,
4398             'ext':       'mp4',
4399             'title':     video_title,
4400             'thumbnail': thumbnail,
4401             'uploader':  uploader,
4402         }]
4403
4404 class FlickrIE(InfoExtractor):
4405     """Information Extractor for Flickr videos"""
4406     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
4407
4408     def _real_extract(self, url):
4409         mobj = re.match(self._VALID_URL, url)
4410
4411         video_id = mobj.group('id')
4412         video_uploader_id = mobj.group('uploader_id')
4413         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
4414         webpage = self._download_webpage(webpage_url, video_id)
4415
4416         mobj = re.search(r"photo_secret: '(\w+)'", webpage)
4417         if mobj is None:
4418             raise ExtractorError(u'Unable to extract video secret')
4419         secret = mobj.group(1)
4420
4421         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
4422         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
4423
4424         mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
4425         if mobj is None:
4426             raise ExtractorError(u'Unable to extract node_id')
4427         node_id = mobj.group(1)
4428
4429         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
4430         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
4431
4432         self.report_extraction(video_id)
4433
4434         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
4435         if mobj is None:
4436             raise ExtractorError(u'Unable to extract video url')
4437         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
4438
4439         mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4440         if mobj is None:
4441             raise ExtractorError(u'Unable to extract title')
4442         video_title = mobj.group(1) or mobj.group(2)
4443
4444         mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4445         if mobj is None:
4446             self._downloader.report_warning(u'unable to extract description')
4447             video_description = None
4448         else:
4449             video_description = mobj.group(1) or mobj.group(2)
4450
4451         mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
4452         if mobj is None:
4453             raise ExtractorError(u'Unable to extract thumbnail')
4454         thumbnail = mobj.group(1) or mobj.group(2)
4455
4456         return [{
4457             'id':          video_id,
4458             'url':         video_url,
4459             'ext':         'mp4',
4460             'title':       video_title,
4461             'description': video_description,
4462             'thumbnail':   thumbnail,
4463             'uploader_id': video_uploader_id,
4464         }]
4465
4466 class TeamcocoIE(InfoExtractor):
4467     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
4468
4469     def _real_extract(self, url):
4470         mobj = re.match(self._VALID_URL, url)
4471         if mobj is None:
4472             raise ExtractorError(u'Invalid URL: %s' % url)
4473         url_title = mobj.group('url_title')
4474         webpage = self._download_webpage(url, url_title)
4475
4476         mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
4477         video_id = mobj.group(1)
4478
4479         self.report_extraction(video_id)
4480
4481         mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
4482         if mobj is None:
4483             raise ExtractorError(u'Unable to extract title')
4484         video_title = mobj.group(1)
4485
4486         mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
4487         if mobj is None:
4488             raise ExtractorError(u'Unable to extract thumbnail')
4489         thumbnail = mobj.group(1)
4490
4491         mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
4492         if mobj is None:
4493             raise ExtractorError(u'Unable to extract description')
4494         description = mobj.group(1)
4495
4496         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
4497         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
4498         mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
4499         if mobj is None:
4500             raise ExtractorError(u'Unable to extract video url')
4501         video_url = mobj.group(1)
4502
4503         return [{
4504             'id':          video_id,
4505             'url':         video_url,
4506             'ext':         'mp4',
4507             'title':       video_title,
4508             'thumbnail':   thumbnail,
4509             'description': description,
4510         }]
4511
4512 class XHamsterIE(InfoExtractor):
4513     """Information Extractor for xHamster"""
4514     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
4515
4516     def _real_extract(self,url):
4517         mobj = re.match(self._VALID_URL, url)
4518
4519         video_id = mobj.group('id')
4520         mrss_url='http://xhamster.com/movies/%s/.html' % video_id
4521         webpage = self._download_webpage(mrss_url, video_id)
4522         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
4523         if mobj is None:
4524             raise ExtractorError(u'Unable to extract media URL')
4525         if len(mobj.group('server')) == 0:
4526             video_url = compat_urllib_parse.unquote(mobj.group('file'))
4527         else:
4528             video_url = mobj.group('server')+'/key='+mobj.group('file')
4529         video_extension = video_url.split('.')[-1]
4530
4531         mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
4532         if mobj is None:
4533             raise ExtractorError(u'Unable to extract title')
4534         video_title = unescapeHTML(mobj.group('title'))
4535
4536         mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
4537         if mobj is None:
4538             video_description = u''
4539         else:
4540             video_description = unescapeHTML(mobj.group('description'))
4541
4542         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
4543         if mobj is None:
4544             raise ExtractorError(u'Unable to extract upload date')
4545         video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
4546
4547         mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
4548         if mobj is None:
4549             video_uploader_id = u'anonymous'
4550         else:
4551             video_uploader_id = mobj.group('uploader_id')
4552
4553         mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
4554         if mobj is None:
4555             raise ExtractorError(u'Unable to extract thumbnail URL')
4556         video_thumbnail = mobj.group('thumbnail')
4557
4558         return [{
4559             'id':       video_id,
4560             'url':      video_url,
4561             'ext':      video_extension,
4562             'title':    video_title,
4563             'description': video_description,
4564             'upload_date': video_upload_date,
4565             'uploader_id': video_uploader_id,
4566             'thumbnail': video_thumbnail
4567         }]
4568
4569 class HypemIE(InfoExtractor):
4570     """Information Extractor for hypem"""
4571     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
4572
4573     def _real_extract(self, url):
4574         mobj = re.match(self._VALID_URL, url)
4575         if mobj is None:
4576             raise ExtractorError(u'Invalid URL: %s' % url)
4577         track_id = mobj.group(1)
4578
4579         data = { 'ax': 1, 'ts': time.time() }
4580         data_encoded = compat_urllib_parse.urlencode(data)
4581         complete_url = url + "?" + data_encoded
4582         request = compat_urllib_request.Request(complete_url)
4583         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
4584         cookie = urlh.headers.get('Set-Cookie', '')
4585
4586         self.report_extraction(track_id)
4587         mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
4588         if mobj is None:
4589             raise ExtractorError(u'Unable to extrack tracks')
4590         html_tracks = mobj.group(1).strip()
4591         try:
4592             track_list = json.loads(html_tracks)
4593             track = track_list[u'tracks'][0]
4594         except ValueError:
4595             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4596
4597         key = track[u"key"]
4598         track_id = track[u"id"]
4599         artist = track[u"artist"]
4600         title = track[u"song"]
4601
4602         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
4603         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
4604         request.add_header('cookie', cookie)
4605         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
4606         try:
4607             song_data = json.loads(song_data_json)
4608         except ValueError:
4609             raise ExtractorError(u'Hypemachine contained invalid JSON.')
4610         final_url = song_data[u"url"]
4611
4612         return [{
4613             'id':       track_id,
4614             'url':      final_url,
4615             'ext':      "mp3",
4616             'title':    title,
4617             'artist':   artist,
4618         }]
4619
4620 class Vbox7IE(InfoExtractor):
4621     """Information Extractor for Vbox7"""
4622     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
4623
4624     def _real_extract(self,url):
4625         mobj = re.match(self._VALID_URL, url)
4626         if mobj is None:
4627             raise ExtractorError(u'Invalid URL: %s' % url)
4628         video_id = mobj.group(1)
4629
4630         redirect_page, urlh = self._download_webpage_handle(url, video_id)
4631         redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
4632         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
4633
4634         title = re.search(r'<title>(.*)</title>', webpage)
4635         title = (title.group(1)).split('/')[0].strip()
4636
4637         ext = "flv"
4638         info_url = "http://vbox7.com/play/magare.do"
4639         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
4640         info_request = compat_urllib_request.Request(info_url, data)
4641         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
4642         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
4643         if info_response is None:
4644             raise ExtractorError(u'Unable to extract the media url')
4645         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
4646
4647         return [{
4648             'id':        video_id,
4649             'url':       final_url,
4650             'ext':       ext,
4651             'title':     title,
4652             'thumbnail': thumbnail_url,
4653         }]
4654
4655 def gen_extractors():
4656     """ Return a list of an instance of every supported extractor.
4657     The order does matter; the first extractor matched is the one handling the URL.
4658     """
4659     return [
4660         YoutubePlaylistIE(),
4661         YoutubeChannelIE(),
4662         YoutubeUserIE(),
4663         YoutubeSearchIE(),
4664         YoutubeIE(),
4665         MetacafeIE(),
4666         DailymotionIE(),
4667         GoogleSearchIE(),
4668         PhotobucketIE(),
4669         YahooIE(),
4670         YahooSearchIE(),
4671         DepositFilesIE(),
4672         FacebookIE(),
4673         BlipTVIE(),
4674         BlipTVUserIE(),
4675         VimeoIE(),
4676         MyVideoIE(),
4677         ComedyCentralIE(),
4678         EscapistIE(),
4679         CollegeHumorIE(),
4680         XVideosIE(),
4681         SoundcloudSetIE(),
4682         SoundcloudIE(),
4683         InfoQIE(),
4684         MixcloudIE(),
4685         StanfordOpenClassroomIE(),
4686         MTVIE(),
4687         YoukuIE(),
4688         XNXXIE(),
4689         YouJizzIE(),
4690         PornotubeIE(),
4691         YouPornIE(),
4692         GooglePlusIE(),
4693         ArteTvIE(),
4694         NBAIE(),
4695         WorldStarHipHopIE(),
4696         JustinTVIE(),
4697         FunnyOrDieIE(),
4698         SteamIE(),
4699         UstreamIE(),
4700         RBMARadioIE(),
4701         EightTracksIE(),
4702         KeekIE(),
4703         TEDIE(),
4704         MySpassIE(),
4705         SpiegelIE(),
4706         LiveLeakIE(),
4707         ARDIE(),
4708         ZDFIE(),
4709         TumblrIE(),
4710         BandcampIE(),
4711         RedTubeIE(),
4712         InaIE(),
4713         HowcastIE(),
4714         VineIE(),
4715         FlickrIE(),
4716         TeamcocoIE(),
4717         XHamsterIE(),
4718         HypemIE(),
4719         Vbox7IE(),
4720         GenericIE()
4721     ]
4722
4723 def get_info_extractor(ie_name):
4724     """Returns the info extractor class with the given ie_name"""
4725     return globals()[ie_name+'IE']