_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194 class SearchInfoExtractor(InfoExtractor):
 195     """
 196     Base class for paged search queries extractors.
 197     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 198     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 199     """
 200
 201     @classmethod
 202     def _make_valid_url(cls):
 203         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 204
 205     @classmethod
 206     def suitable(cls, url):
 207         return re.match(cls._make_valid_url(), url) is not None
 208
 209     def _real_extract(self, query):
 210         mobj = re.match(self._make_valid_url(), query)
 211         if mobj is None:
 212             raise ExtractorError(u'Invalid search query "%s"' % query)
 213
 214         prefix = mobj.group('prefix')
 215         query = mobj.group('query')
 216         if prefix == '':
 217             return self._get_n_results(query, 1)
 218         elif prefix == 'all':
 219             return self._get_n_results(query, self._MAX_RESULTS)
 220         else:
 221             n = int(prefix)
 222             if n <= 0:
 223                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 224             elif n > self._MAX_RESULTS:
 225                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 226                 n = self._MAX_RESULTS
 227             return self._get_n_results(query, n)
 228
 229     def _get_n_results(self, query, n):
 230         """Get a specified number of results for a query"""
 231         raise NotImplementedError("This method must be implemented by sublclasses")
 232
 233
 234 class YoutubeIE(InfoExtractor):
 235     """Information extractor for youtube.com."""
 236
 237     _VALID_URL = r"""^
 238                      (
 239                          (?:https?://)?                                       # http(s):// (optional)
 240                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 241                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 242                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 243                          (?:                                                  # the various things that can precede the ID:
 244                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 245                              |(?:                                             # or the v= param in all its forms
 246                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 247                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 248                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 249                                  v=
 250                              )
 251                          )?                                                   # optional -> youtube.com/xxxx is OK
 252                      )?                                                       # all until now is optional -> you can pass the naked ID
 253                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 254                      (?(1).+)?                                                # if we found the ID, everything can follow
 255                      $"""
 256     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 257     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 258     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 259     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 260     _NETRC_MACHINE = 'youtube'
 261     # Listed in order of quality
 262     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 263     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 264     _video_extensions = {
 265         '13': '3gp',
 266         '17': 'mp4',
 267         '18': 'mp4',
 268         '22': 'mp4',
 269         '37': 'mp4',
 270         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 271         '43': 'webm',
 272         '44': 'webm',
 273         '45': 'webm',
 274         '46': 'webm',
 275     }
 276     _video_dimensions = {
 277         '5': '240x400',
 278         '6': '???',
 279         '13': '???',
 280         '17': '144x176',
 281         '18': '360x640',
 282         '22': '720x1280',
 283         '34': '360x640',
 284         '35': '480x854',
 285         '37': '1080x1920',
 286         '38': '3072x4096',
 287         '43': '360x640',
 288         '44': '480x854',
 289         '45': '720x1280',
 290         '46': '1080x1920',
 291     }
 292     IE_NAME = u'youtube'
 293
 294     @classmethod
 295     def suitable(cls, url):
 296         """Receives a URL and returns True if suitable for this IE."""
 297         if YoutubePlaylistIE.suitable(url): return False
 298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 299
 300     def report_lang(self):
 301         """Report attempt to set language."""
 302         self.to_screen(u'Setting language')
 303
 304     def report_login(self):
 305         """Report attempt to log in."""
 306         self.to_screen(u'Logging in')
 307
 308     def report_video_webpage_download(self, video_id):
 309         """Report attempt to download video webpage."""
 310         self.to_screen(u'%s: Downloading video webpage' % video_id)
 311
 312     def report_video_info_webpage_download(self, video_id):
 313         """Report attempt to download video info webpage."""
 314         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 315
 316     def report_video_subtitles_download(self, video_id):
 317         """Report attempt to download video info webpage."""
 318         self.to_screen(u'%s: Checking available subtitles' % video_id)
 319
 320     def report_video_subtitles_request(self, video_id, sub_lang, format):
 321         """Report attempt to download video info webpage."""
 322         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 323
 324     def report_video_subtitles_available(self, video_id, sub_lang_list):
 325         """Report available subtitles."""
 326         sub_lang = ",".join(list(sub_lang_list.keys()))
 327         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 328
 329     def report_information_extraction(self, video_id):
 330         """Report attempt to extract video information."""
 331         self.to_screen(u'%s: Extracting video information' % video_id)
 332
 333     def report_unavailable_format(self, video_id, format):
 334         """Report extracted video URL."""
 335         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 336
 337     def report_rtmp_download(self):
 338         """Indicate the download will use the RTMP protocol."""
 339         self.to_screen(u'RTMP download detected')
 340
 341     def _get_available_subtitles(self, video_id):
 342         self.report_video_subtitles_download(video_id)
 343         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 344         try:
 345             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 346         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 347             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 348         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 349         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 350         if not sub_lang_list:
 351             return (u'video doesn\'t have subtitles', None)
 352         return sub_lang_list
 353
 354     def _list_available_subtitles(self, video_id):
 355         sub_lang_list = self._get_available_subtitles(video_id)
 356         self.report_video_subtitles_available(video_id, sub_lang_list)
 357
 358     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 359         """
 360         Return tuple:
 361         (error_message, sub_lang, sub)
 362         """
 363         self.report_video_subtitles_request(video_id, sub_lang, format)
 364         params = compat_urllib_parse.urlencode({
 365             'lang': sub_lang,
 366             'name': sub_name,
 367             'v': video_id,
 368             'fmt': format,
 369         })
 370         url = 'http://www.youtube.com/api/timedtext?' + params
 371         try:
 372             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 373         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 374             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 375         if not sub:
 376             return (u'Did not fetch video subtitles', None, None)
 377         return (None, sub_lang, sub)
 378
 379     def _request_automatic_caption(self, video_id, webpage):
 380         """We need the webpage for getting the captions url, pass it as an
 381            argument to speed up the process."""
 382         sub_lang = self._downloader.params.get('subtitleslang')
 383         sub_format = self._downloader.params.get('subtitlesformat')
 384         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 385         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 386         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 387         if mobj is None:
 388             return [(err_msg, None, None)]
 389         player_config = json.loads(mobj.group(1))
 390         try:
 391             args = player_config[u'args']
 392             caption_url = args[u'ttsurl']
 393             timestamp = args[u'timestamp']
 394             params = compat_urllib_parse.urlencode({
 395                 'lang': 'en',
 396                 'tlang': sub_lang,
 397                 'fmt': sub_format,
 398                 'ts': timestamp,
 399                 'kind': 'asr',
 400             })
 401             subtitles_url = caption_url + '&' + params
 402             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 403             return [(None, sub_lang, sub)]
 404         except KeyError:
 405             return [(err_msg, None, None)]
 406
 407     def _extract_subtitle(self, video_id):
 408         """
 409         Return a list with a tuple:
 410         [(error_message, sub_lang, sub)]
 411         """
 412         sub_lang_list = self._get_available_subtitles(video_id)
 413         sub_format = self._downloader.params.get('subtitlesformat')
 414         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 415             return [(sub_lang_list[0], None, None)]
 416         if self._downloader.params.get('subtitleslang', False):
 417             sub_lang = self._downloader.params.get('subtitleslang')
 418         elif 'en' in sub_lang_list:
 419             sub_lang = 'en'
 420         else:
 421             sub_lang = list(sub_lang_list.keys())[0]
 422         if not sub_lang in sub_lang_list:
 423             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 424
 425         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 426         return [subtitle]
 427
 428     def _extract_all_subtitles(self, video_id):
 429         sub_lang_list = self._get_available_subtitles(video_id)
 430         sub_format = self._downloader.params.get('subtitlesformat')
 431         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 432             return [(sub_lang_list[0], None, None)]
 433         subtitles = []
 434         for sub_lang in sub_lang_list:
 435             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 436             subtitles.append(subtitle)
 437         return subtitles
 438
 439     def _print_formats(self, formats):
 440         print('Available formats:')
 441         for x in formats:
 442             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 443
 444     def _real_initialize(self):
 445         if self._downloader is None:
 446             return
 447
 448         username = None
 449         password = None
 450         downloader_params = self._downloader.params
 451
 452         # Attempt to use provided username and password or .netrc data
 453         if downloader_params.get('username', None) is not None:
 454             username = downloader_params['username']
 455             password = downloader_params['password']
 456         elif downloader_params.get('usenetrc', False):
 457             try:
 458                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 459                 if info is not None:
 460                     username = info[0]
 461                     password = info[2]
 462                 else:
 463                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 464             except (IOError, netrc.NetrcParseError) as err:
 465                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 466                 return
 467
 468         # Set language
 469         request = compat_urllib_request.Request(self._LANG_URL)
 470         try:
 471             self.report_lang()
 472             compat_urllib_request.urlopen(request).read()
 473         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 474             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 475             return
 476
 477         # No authentication to be performed
 478         if username is None:
 479             return
 480
 481         request = compat_urllib_request.Request(self._LOGIN_URL)
 482         try:
 483             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 484         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 485             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 486             return
 487
 488         galx = None
 489         dsh = None
 490         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 491         if match:
 492           galx = match.group(1)
 493
 494         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 495         if match:
 496           dsh = match.group(1)
 497
 498         # Log in
 499         login_form_strs = {
 500                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 501                 u'Email': username,
 502                 u'GALX': galx,
 503                 u'Passwd': password,
 504                 u'PersistentCookie': u'yes',
 505                 u'_utf8': u'霱',
 506                 u'bgresponse': u'js_disabled',
 507                 u'checkConnection': u'',
 508                 u'checkedDomains': u'youtube',
 509                 u'dnConn': u'',
 510                 u'dsh': dsh,
 511                 u'pstMsg': u'0',
 512                 u'rmShown': u'1',
 513                 u'secTok': u'',
 514                 u'signIn': u'Sign in',
 515                 u'timeStmp': u'',
 516                 u'service': u'youtube',
 517                 u'uilel': u'3',
 518                 u'hl': u'en_US',
 519         }
 520         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 521         # chokes on unicode
 522         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 523         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 524         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 525         try:
 526             self.report_login()
 527             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 528             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 529                 self._downloader.report_warning(u'unable to log in: bad username or password')
 530                 return
 531         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 532             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 533             return
 534
 535         # Confirm age
 536         age_form = {
 537                 'next_url':     '/',
 538                 'action_confirm':   'Confirm',
 539                 }
 540         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 541         try:
 542             self.report_age_confirmation()
 543             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 544         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 545             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 546
 547     def _extract_id(self, url):
 548         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 549         if mobj is None:
 550             raise ExtractorError(u'Invalid URL: %s' % url)
 551         video_id = mobj.group(2)
 552         return video_id
 553
 554     def _real_extract(self, url):
 555         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 556         mobj = re.search(self._NEXT_URL_RE, url)
 557         if mobj:
 558             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 559         video_id = self._extract_id(url)
 560
 561         # Get video webpage
 562         self.report_video_webpage_download(video_id)
 563         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 564         request = compat_urllib_request.Request(url)
 565         try:
 566             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 567         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 568             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 569
 570         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 571
 572         # Attempt to extract SWF player URL
 573         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 574         if mobj is not None:
 575             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 576         else:
 577             player_url = None
 578
 579         # Get video info
 580         self.report_video_info_webpage_download(video_id)
 581         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 582             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 583                     % (video_id, el_type))
 584             video_info_webpage = self._download_webpage(video_info_url, video_id,
 585                                     note=False,
 586                                     errnote='unable to download video info webpage')
 587             video_info = compat_parse_qs(video_info_webpage)
 588             if 'token' in video_info:
 589                 break
 590         if 'token' not in video_info:
 591             if 'reason' in video_info:
 592                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 593             else:
 594                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 595
 596         # Check for "rental" videos
 597         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 598             raise ExtractorError(u'"rental" videos not supported')
 599
 600         # Start extracting information
 601         self.report_information_extraction(video_id)
 602
 603         # uploader
 604         if 'author' not in video_info:
 605             raise ExtractorError(u'Unable to extract uploader name')
 606         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 607
 608         # uploader_id
 609         video_uploader_id = None
 610         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 611         if mobj is not None:
 612             video_uploader_id = mobj.group(1)
 613         else:
 614             self._downloader.report_warning(u'unable to extract uploader nickname')
 615
 616         # title
 617         if 'title' not in video_info:
 618             raise ExtractorError(u'Unable to extract video title')
 619         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 620
 621         # thumbnail image
 622         if 'thumbnail_url' not in video_info:
 623             self._downloader.report_warning(u'unable to extract video thumbnail')
 624             video_thumbnail = ''
 625         else:   # don't panic if we can't find it
 626             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 627
 628         # upload date
 629         upload_date = None
 630         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 631         if mobj is not None:
 632             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 633             upload_date = unified_strdate(upload_date)
 634
 635         # description
 636         video_description = get_element_by_id("eow-description", video_webpage)
 637         if video_description:
 638             video_description = clean_html(video_description)
 639         else:
 640             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 641             if fd_mobj:
 642                 video_description = unescapeHTML(fd_mobj.group(1))
 643             else:
 644                 video_description = u''
 645
 646         # subtitles
 647         video_subtitles = None
 648
 649         if self._downloader.params.get('writesubtitles', False):
 650             video_subtitles = self._extract_subtitle(video_id)
 651             if video_subtitles:
 652                 (sub_error, sub_lang, sub) = video_subtitles[0]
 653                 if sub_error:
 654                     # We try with the automatic captions
 655                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 656                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 657                     if sub is not None:
 658                         pass
 659                     else:
 660                         # We report the original error
 661                         self._downloader.report_error(sub_error)
 662
 663         if self._downloader.params.get('allsubtitles', False):
 664             video_subtitles = self._extract_all_subtitles(video_id)
 665             for video_subtitle in video_subtitles:
 666                 (sub_error, sub_lang, sub) = video_subtitle
 667                 if sub_error:
 668                     self._downloader.report_error(sub_error)
 669
 670         if self._downloader.params.get('listsubtitles', False):
 671             sub_lang_list = self._list_available_subtitles(video_id)
 672             return
 673
 674         if 'length_seconds' not in video_info:
 675             self._downloader.report_warning(u'unable to extract video duration')
 676             video_duration = ''
 677         else:
 678             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 679
 680         # token
 681         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 682
 683         # Decide which formats to download
 684         req_format = self._downloader.params.get('format', None)
 685
 686         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 687             self.report_rtmp_download()
 688             video_url_list = [(None, video_info['conn'][0])]
 689         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 690             url_map = {}
 691             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 692                 url_data = compat_parse_qs(url_data_str)
 693                 if 'itag' in url_data and 'url' in url_data:
 694                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 695                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 696                     url_map[url_data['itag'][0]] = url
 697
 698             format_limit = self._downloader.params.get('format_limit', None)
 699             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 700             if format_limit is not None and format_limit in available_formats:
 701                 format_list = available_formats[available_formats.index(format_limit):]
 702             else:
 703                 format_list = available_formats
 704             existing_formats = [x for x in format_list if x in url_map]
 705             if len(existing_formats) == 0:
 706                 raise ExtractorError(u'no known formats available for video')
 707             if self._downloader.params.get('listformats', None):
 708                 self._print_formats(existing_formats)
 709                 return
 710             if req_format is None or req_format == 'best':
 711                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 712             elif req_format == 'worst':
 713                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 714             elif req_format in ('-1', 'all'):
 715                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 716             else:
 717                 # Specific formats. We pick the first in a slash-delimeted sequence.
 718                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 719                 req_formats = req_format.split('/')
 720                 video_url_list = None
 721                 for rf in req_formats:
 722                     if rf in url_map:
 723                         video_url_list = [(rf, url_map[rf])]
 724                         break
 725                 if video_url_list is None:
 726                     raise ExtractorError(u'requested format not available')
 727         else:
 728             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 729
 730         results = []
 731         for format_param, video_real_url in video_url_list:
 732             # Extension
 733             video_extension = self._video_extensions.get(format_param, 'flv')
 734
 735             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 736                                               self._video_dimensions.get(format_param, '???'))
 737
 738             results.append({
 739                 'id':       video_id,
 740                 'url':      video_real_url,
 741                 'uploader': video_uploader,
 742                 'uploader_id': video_uploader_id,
 743                 'upload_date':  upload_date,
 744                 'title':    video_title,
 745                 'ext':      video_extension,
 746                 'format':   video_format,
 747                 'thumbnail':    video_thumbnail,
 748                 'description':  video_description,
 749                 'player_url':   player_url,
 750                 'subtitles':    video_subtitles,
 751                 'duration':     video_duration
 752             })
 753         return results
 754
 755
 756 class MetacafeIE(InfoExtractor):
 757     """Information Extractor for metacafe.com."""
 758
 759     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 760     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 761     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 762     IE_NAME = u'metacafe'
 763
 764     def report_disclaimer(self):
 765         """Report disclaimer retrieval."""
 766         self.to_screen(u'Retrieving disclaimer')
 767
 768     def _real_initialize(self):
 769         # Retrieve disclaimer
 770         request = compat_urllib_request.Request(self._DISCLAIMER)
 771         try:
 772             self.report_disclaimer()
 773             disclaimer = compat_urllib_request.urlopen(request).read()
 774         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 775             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 776
 777         # Confirm age
 778         disclaimer_form = {
 779             'filters': '0',
 780             'submit': "Continue - I'm over 18",
 781             }
 782         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 783         try:
 784             self.report_age_confirmation()
 785             disclaimer = compat_urllib_request.urlopen(request).read()
 786         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 787             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 788
 789     def _real_extract(self, url):
 790         # Extract id and simplified title from URL
 791         mobj = re.match(self._VALID_URL, url)
 792         if mobj is None:
 793             raise ExtractorError(u'Invalid URL: %s' % url)
 794
 795         video_id = mobj.group(1)
 796
 797         # Check if video comes from YouTube
 798         mobj2 = re.match(r'^yt-(.*)$', video_id)
 799         if mobj2 is not None:
 800             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 801
 802         # Retrieve video webpage to extract further information
 803         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 804
 805         # Extract URL, uploader and title from webpage
 806         self.report_extraction(video_id)
 807         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 808         if mobj is not None:
 809             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 810             video_extension = mediaURL[-3:]
 811
 812             # Extract gdaKey if available
 813             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 814             if mobj is None:
 815                 video_url = mediaURL
 816             else:
 817                 gdaKey = mobj.group(1)
 818                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 819         else:
 820             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 821             if mobj is None:
 822                 raise ExtractorError(u'Unable to extract media URL')
 823             vardict = compat_parse_qs(mobj.group(1))
 824             if 'mediaData' not in vardict:
 825                 raise ExtractorError(u'Unable to extract media URL')
 826             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 827             if mobj is None:
 828                 raise ExtractorError(u'Unable to extract media URL')
 829             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 830             video_extension = mediaURL[-3:]
 831             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 832
 833         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 834         if mobj is None:
 835             raise ExtractorError(u'Unable to extract title')
 836         video_title = mobj.group(1).decode('utf-8')
 837
 838         mobj = re.search(r'submitter=(.*?);', webpage)
 839         if mobj is None:
 840             raise ExtractorError(u'Unable to extract uploader nickname')
 841         video_uploader = mobj.group(1)
 842
 843         return [{
 844             'id':       video_id.decode('utf-8'),
 845             'url':      video_url.decode('utf-8'),
 846             'uploader': video_uploader.decode('utf-8'),
 847             'upload_date':  None,
 848             'title':    video_title,
 849             'ext':      video_extension.decode('utf-8'),
 850         }]
 851
 852 class DailymotionIE(InfoExtractor):
 853     """Information Extractor for Dailymotion"""
 854
 855     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 856     IE_NAME = u'dailymotion'
 857
 858     def _real_extract(self, url):
 859         # Extract id and simplified title from URL
 860         mobj = re.match(self._VALID_URL, url)
 861         if mobj is None:
 862             raise ExtractorError(u'Invalid URL: %s' % url)
 863
 864         video_id = mobj.group(1).split('_')[0].split('?')[0]
 865
 866         video_extension = 'mp4'
 867
 868         # Retrieve video webpage to extract further information
 869         request = compat_urllib_request.Request(url)
 870         request.add_header('Cookie', 'family_filter=off')
 871         webpage = self._download_webpage(request, video_id)
 872
 873         # Extract URL, uploader and title from webpage
 874         self.report_extraction(video_id)
 875         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 876         if mobj is None:
 877             raise ExtractorError(u'Unable to extract media URL')
 878         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 879
 880         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 881             if key in flashvars:
 882                 max_quality = key
 883                 self.to_screen(u'Using %s' % key)
 884                 break
 885         else:
 886             raise ExtractorError(u'Unable to extract video URL')
 887
 888         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 889         if mobj is None:
 890             raise ExtractorError(u'Unable to extract video URL')
 891
 892         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 893
 894         # TODO: support choosing qualities
 895
 896         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 897         if mobj is None:
 898             raise ExtractorError(u'Unable to extract title')
 899         video_title = unescapeHTML(mobj.group('title'))
 900
 901         video_uploader = None
 902         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 903         if mobj is None:
 904             # lookin for official user
 905             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 906             if mobj_official is None:
 907                 self._downloader.report_warning(u'unable to extract uploader nickname')
 908             else:
 909                 video_uploader = mobj_official.group(1)
 910         else:
 911             video_uploader = mobj.group(1)
 912
 913         video_upload_date = None
 914         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 915         if mobj is not None:
 916             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 917
 918         return [{
 919             'id':       video_id,
 920             'url':      video_url,
 921             'uploader': video_uploader,
 922             'upload_date':  video_upload_date,
 923             'title':    video_title,
 924             'ext':      video_extension,
 925         }]
 926
 927
 928 class PhotobucketIE(InfoExtractor):
 929     """Information extractor for photobucket.com."""
 930
 931     # TODO: the original _VALID_URL was:
 932     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 933     # Check if it's necessary to keep the old extracion process
 934     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 935     IE_NAME = u'photobucket'
 936
 937     def _real_extract(self, url):
 938         # Extract id from URL
 939         mobj = re.match(self._VALID_URL, url)
 940         if mobj is None:
 941             raise ExtractorError(u'Invalid URL: %s' % url)
 942
 943         video_id = mobj.group('id')
 944
 945         video_extension = mobj.group('ext')
 946
 947         # Retrieve video webpage to extract further information
 948         webpage = self._download_webpage(url, video_id)
 949
 950         # Extract URL, uploader, and title from webpage
 951         self.report_extraction(video_id)
 952         # We try first by looking the javascript code:
 953         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 954         if mobj is not None:
 955             info = json.loads(mobj.group('json'))
 956             return [{
 957                 'id':       video_id,
 958                 'url':      info[u'downloadUrl'],
 959                 'uploader': info[u'username'],
 960                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 961                 'title':    info[u'title'],
 962                 'ext':      video_extension,
 963                 'thumbnail': info[u'thumbUrl'],
 964             }]
 965
 966         # We try looking in other parts of the webpage
 967         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 968         if mobj is None:
 969             raise ExtractorError(u'Unable to extract media URL')
 970         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 971
 972         video_url = mediaURL
 973
 974         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 975         if mobj is None:
 976             raise ExtractorError(u'Unable to extract title')
 977         video_title = mobj.group(1).decode('utf-8')
 978
 979         video_uploader = mobj.group(2).decode('utf-8')
 980
 981         return [{
 982             'id':       video_id.decode('utf-8'),
 983             'url':      video_url.decode('utf-8'),
 984             'uploader': video_uploader,
 985             'upload_date':  None,
 986             'title':    video_title,
 987             'ext':      video_extension.decode('utf-8'),
 988         }]
 989
 990
 991 class YahooIE(InfoExtractor):
 992     """Information extractor for screen.yahoo.com."""
 993     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
 994
 995     def _real_extract(self, url):
 996         mobj = re.match(self._VALID_URL, url)
 997         if mobj is None:
 998             raise ExtractorError(u'Invalid URL: %s' % url)
 999         video_id = mobj.group('id')
1000         webpage = self._download_webpage(url, video_id)
1001         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1002
1003         if m_id is None:
1004             # TODO: Check which url parameters are required
1005             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1006             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1007             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1008                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1009                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1010                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1011                         '''
1012             self.report_extraction(video_id)
1013             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1014             if m_info is None:
1015                 raise ExtractorError(u'Unable to extract video info')
1016             video_title = m_info.group('title')
1017             video_description = m_info.group('description')
1018             video_thumb = m_info.group('thumb')
1019             video_date = m_info.group('date')
1020             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1021
1022             # TODO: Find a way to get mp4 videos
1023             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1024             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1025             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1026             video_url = m_rest.group('url')
1027             video_path = m_rest.group('path')
1028             if m_rest is None:
1029                 raise ExtractorError(u'Unable to extract video url')
1030
1031         else: # We have to use a different method if another id is defined
1032             long_id = m_id.group('new_id')
1033             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1034             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
1035             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1036             info = json.loads(json_str)
1037             res = info[u'query'][u'results'][u'mediaObj'][0]
1038             stream = res[u'streams'][0]
1039             video_path = stream[u'path']
1040             video_url = stream[u'host']
1041             meta = res[u'meta']
1042             video_title = meta[u'title']
1043             video_description = meta[u'description']
1044             video_thumb = meta[u'thumbnail']
1045             video_date = None # I can't find it
1046
1047         info_dict = {
1048                      'id': video_id,
1049                      'url': video_url,
1050                      'play_path': video_path,
1051                      'title':video_title,
1052                      'description': video_description,
1053                      'thumbnail': video_thumb,
1054                      'upload_date': video_date,
1055                      'ext': 'flv',
1056                      }
1057         return info_dict
1058
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the video information for a Vimeo URL.

        new_video is accepted but not used by this method.
        Returns a one-element list holding an info dictionary with id, url,
        uploader, uploader_id, upload_date, title, ext, thumbnail and
        description. Raises ExtractorError when the URL is invalid, when the
        config JSON cannot be extracted from the page, or when none of the
        known codecs is listed.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is absent from the page;
            # ValueError: the extracted text is not valid JSON. Anything
            # else (e.g. KeyboardInterrupt) is deliberately not caught.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec of the best non-empty quality bucket
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1160
1161
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body.

        Raises ExtractorError when the download fails or the URL is rejected
        as invalid.
        """
        try:
            # Built inside the try block: on Python 3 an unusable URL is
            # rejected with ValueError by the Request constructor itself.
            request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the requested groups.

        matchTuples is an iterable of (group, key, error message) tuples;
        every listed group is stored under its key in the returned dict.
        Raises ExtractorError when the regex does not match or when one of
        the listed groups did not participate in the match.
        """
        page = self.fetch_webpage(url)
        if isinstance(page, bytes):
            # The patterns are text patterns, which Python 3 refuses to apply
            # to bytes. UTF-8 is what the title used to be decoded with;
            # NOTE(review): confirm every fetched document is UTF-8.
            page = page.decode('utf-8', 'replace')
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Walk the live stream pages for url; currently returns nothing.

        The stream path, player and rtmp URL are looked up (raising
        ExtractorError when one is missing), but the assembled URL is not
        returned or stored anywhere.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): computed but never used; _real_extract returns None
        # for live URLs.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the page -> videoref -> per-language XML chain and return the info dict.

        Raises ExtractorError (via grep_webpage) when any step does not
        provide the expected data.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # grep_webpage already hands back decoded text
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract the video information for an arte.tv URL.

        Returns a one-element list with the info dictionary, or None for
        live URLs (last path component matching _LIVE_URL).
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1281
1282
1283 class GenericIE(InfoExtractor):
1284     """Generic last-resort information extractor."""
1285
1286     _VALID_URL = r'.*'
1287     IE_NAME = u'generic'
1288
1289     def report_download_webpage(self, video_id):
1290         """Report webpage download."""
1291         if not self._downloader.params.get('test', False):
1292             self._downloader.report_warning(u'Falling back on generic information extractor.')
1293         super(GenericIE, self).report_download_webpage(video_id)
1294
1295     def report_following_redirect(self, new_url):
1296         """Report information extraction."""
1297         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1298
1299     def _test_redirect(self, url):
1300         """Check if it is a redirect, like url shorteners, in case return the new url."""
1301         class HeadRequest(compat_urllib_request.Request):
1302             def get_method(self):
1303                 return "HEAD"
1304
1305         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1306             """
1307             Subclass the HTTPRedirectHandler to make it use our
1308             HeadRequest also on the redirected URL
1309             """
1310             def redirect_request(self, req, fp, code, msg, headers, newurl):
1311                 if code in (301, 302, 303, 307):
1312                     newurl = newurl.replace(' ', '%20')
1313                     newheaders = dict((k,v) for k,v in req.headers.items()
1314                                       if k.lower() not in ("content-length", "content-type"))
1315                     return HeadRequest(newurl,
1316                                        headers=newheaders,
1317                                        origin_req_host=req.get_origin_req_host(),
1318                                        unverifiable=True)
1319                 else:
1320                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1321
1322         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1323             """
1324             Fallback to GET if HEAD is not allowed (405 HTTP error)
1325             """
1326             def http_error_405(self, req, fp, code, msg, headers):
1327                 fp.read()
1328                 fp.close()
1329
1330                 newheaders = dict((k,v) for k,v in req.headers.items()
1331                                   if k.lower() not in ("content-length", "content-type"))
1332                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1333                                                  headers=newheaders,
1334                                                  origin_req_host=req.get_origin_req_host(),
1335                                                  unverifiable=True))
1336
1337         # Build our opener
1338         opener = compat_urllib_request.OpenerDirector()
1339         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1340                         HTTPMethodFallback, HEADRedirectHandler,
1341                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1342             opener.add_handler(handler())
1343
1344         response = opener.open(HeadRequest(url))
1345         if response is None:
1346             raise ExtractorError(u'Invalid URL protocol')
1347         new_url = response.geturl()
1348
1349         if url == new_url:
1350             return False
1351
1352         self.report_following_redirect(new_url)
1353         return new_url
1354
1355     def _real_extract(self, url):
1356         new_url = self._test_redirect(url)
1357         if new_url: return [self.url_result(new_url)]
1358
1359         video_id = url.split('/')[-1]
1360         try:
1361             webpage = self._download_webpage(url, video_id)
1362         except ValueError as err:
1363             # since this is the last-resort InfoExtractor, if
1364             # this error is thrown, it'll be thrown here
1365             raise ExtractorError(u'Invalid URL: %s' % url)
1366
1367         self.report_extraction(video_id)
1368         # Start with something easy: JW Player in SWFObject
1369         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1370         if mobj is None:
1371             # Broaden the search a little bit
1372             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit: JWPlayer JS loader
1375             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             raise ExtractorError(u'Invalid URL: %s' % url)
1378
1379         # It's possible that one of the regexes
1380         # matched, but returned an empty group:
1381         if mobj.group(1) is None:
1382             raise ExtractorError(u'Invalid URL: %s' % url)
1383
1384         video_url = compat_urllib_parse.unquote(mobj.group(1))
1385         video_id = os.path.basename(video_url)
1386
1387         # here's a fun little line of code for you:
1388         video_extension = os.path.splitext(video_id)[1][1:]
1389         video_id = os.path.splitext(video_id)[0]
1390
1391         # it's tempting to parse this further, but you would
1392         # have to take into account all the variations like
1393         #   Video Title - Site Name
1394         #   Site Name | Video Title
1395         #   Video Title - Tagline | Site Name
1396         # and so on and so forth; it's just not practical
1397         mobj = re.search(r'<title>(.*)</title>', webpage)
1398         if mobj is None:
1399             raise ExtractorError(u'Unable to extract title')
1400         video_title = mobj.group(1)
1401
1402         # video uploader is domain name
1403         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404         if mobj is None:
1405             raise ExtractorError(u'Unable to extract title')
1406         video_uploader = mobj.group(1)
1407
1408         return [{
1409             'id':       video_id,
1410             'url':      video_url,
1411             'uploader': video_uploader,
1412             'upload_date':  None,
1413             'title':    video_title,
1414             'ext':      video_extension,
1415         }]
1416
1417
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # Only byte strings are decoded: text strings have no decode() on
        # Python 3, and on Python 2 decoding one would first try an implicit
        # ASCII encode.
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages of 50 ids are requested from _API_URL until n ids (or
        all ids the API reports via 'totalItems') have been collected.

        Returns a playlist result with at most n YouTube url results.
        Raises ExtractorError when a page cannot be downloaded or carries
        no 'items'.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the total past n
        video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1460
1461
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Result pages are fetched ten results at a time, beginning at offset
        0, until n entries were collected or the page no longer carries the
        "next page" marker.

        Returns a playlist dictionary whose 'entries' holds at most n url
        entries.
        """

        entries = []

        # Start counting at 0 so the first request uses start=0 and the
        # first page of results is not skipped
        for pagenum in itertools.count():
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                entries.append({
                    '_type': 'url',
                    'url': mobj.group(1)
                })

            if len(entries) >= n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return {
                    '_type': 'playlist',
                    'id': query,
                    # The last page may have pushed the total past n
                    'entries': entries[:n]
                }
1492
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        JSON result pages are requested at offsets 0, 30, 60, ... until n
        entries were collected or the page metadata marks the last result.

        Returns a playlist dictionary whose 'entries' holds at most n
        'Yahoo' url results.
        """

        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Decide from the number of collected entries rather than from
            # the inner loop's index, which is unbound on an empty page
            if len(entries) >= n or m[u'last'] >= (m[u'total'] - 1):
                break

        return res
1526
1527
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose-mode pattern: group 1 is the id found in a playlist-style URL,
    # group 2 is a bare PL/EC/UU-prefixed id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Return True when url matches this extractor's verbose pattern."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Collect every video of a playlist through the paged API.

        Returns a one-element list with a playlist result whose entries are
        ordered by their playlist position. Raises ExtractorError for a
        non-matching URL or an unusable API answer.
        """
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        page_size = self._MAX_RESULTS
        positioned = []

        for page_num in itertools.count(1):
            start_index = page_size * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, page_size, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # The previous page was exactly full, so this one is empty
                break

            page_entries = feed['entry']
            for entry in page_entries:
                # Entries without 'content' are left out
                if 'content' in entry:
                    positioned.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page is the last page
            if len(page_entries) < page_size:
                break

        ordered_urls = [src for _, src in sorted(positioned)]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1593
1594
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, first occurrence order, no repeats."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = link.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Gather the videos of a channel.

        The first page is plain HTML; further pages come from the JSON
        "load more" endpoint for as long as the marker is present.

        Returns a one-element list with a playlist result of YouTube url
        results. Raises ExtractorError for a non-matching URL.
        """
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # First page: regular channel HTML
        pagenum = 1
        first_page = self._download_webpage(
            self._TEMPLATE_URL % (channel_id, pagenum), channel_id,
            u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(first_page))

        # Subsequent pages: json-based channel_ajax query
        more_available = self._MORE_PAGES_INDICATOR in first_page
        while more_available:
            pagenum += 1
            raw = self._download_webpage(
                self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                u'Downloading page #%s' % pagenum)
            payload = json.loads(raw)

            video_ids.extend(self.extract_videos_from_page(payload['content_html']))
            more_available = self._MORE_PAGES_INDICATOR in payload['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1652
1653
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """List a user's uploads through the paged GData feed.

        Returns a one-element list with a playlist result titled with the
        user name. Raises ExtractorError for a non-matching URL.
        """
        name_match = re.match(self._VALID_URL, url)
        if name_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = name_match.group(1)

        # The feed hands out at most _GDATA_PAGE_SIZE ids per request, so
        # keep asking for the next window until one comes back short.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []

        for pagenum in itertools.count():
            start_index = pagenum * page_size + 1
            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            page = self._download_webpage(gdata_url, username, note)

            # Ids of this page, in order of appearance and without repeats
            ids_in_page = []
            for hit in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = hit.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # A page that is not "full" is taken to be the last one
            if len(ids_in_page) < page_size:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1710
1711
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """List the videos of a blip.tv user.

        The user page supplies the numeric users id, which is then used to
        page through the episode list.

        Returns a one-element list with a playlist result titled with the
        user name. Raises ExtractorError for a non-matching URL or when the
        user page carries no users id.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Report a proper extractor error instead of failing on None
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped form, since that is what is stored
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1770
1771
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com file page to its download URL.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError when the page cannot be fetched, when the site shows
        an "Attention" restriction notice, or when the URL or title is
        missing.
        """
        file_id = url.split('/')[-1]
        # Always work on the English-locale page
        url = 'http://depositfiles.com/en/files/' + file_id

        # Posting gateway_result=1 mimics pressing the 'Free download' button
        form_data = compat_urllib_parse.urlencode({ 'gateway_result' : '1' })
        request = compat_urllib_request.Request(url, form_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # NOTE(review): webpage is used undecoded below and the results are
        # .decode()d at the end; this presumably relies on byte strings
        # behaving like text - confirm before running on Python 3.
        url_match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if url_match is None or url_match.group(1) is None:
            # No download form: look for the site's explanation
            notice = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if notice is None or notice.group(1) is None:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)
            restriction_message = re.sub(r'\s+', ' ', notice.group(1)).strip()
            raise ExtractorError(u'%s' % restriction_message)

        file_url = url_match.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # The title lives in the title attribute of a <b> tag
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = title_match.group(1).decode('utf-8')

        info = {
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }
        return [info]
1819
1820
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Announce the login attempt on screen."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in when credentials are available.

        Credentials come from the downloader's 'username'/'password'
        parameters or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. Every failure is reported as a warning and the
        method returns without raising.
        """
        downloader = self._downloader
        if downloader is None:
            return

        params = downloader.params
        useremail = None
        password = None

        # Explicit credentials win over .netrc data
        if params.get('username', None) is not None:
            useremail = params['username']
            password = params['password']
        elif params.get('usenetrc', False):
            try:
                auth = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if auth is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                useremail = auth[0]
                password = auth[2]
            except (IOError, netrc.NetrcParseError) as err:
                downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # Nothing to log in with
            return

        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        encoded_form = compat_urllib_parse.urlencode(login_form)
        request = compat_urllib_request.Request(self._LOGIN_URL, encoded_form)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Getting the login form back means the login was not accepted
            still_login_form = re.search(r'<form(.*)name="login"(.*)</form>', login_results)
            if still_login_form is not None:
                downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract one Facebook video.

        The player parameters are read from the JSON embedded between two
        fixed script fragments of the video page.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError when the URL does not match or the data, video URL
        or title cannot be found.
        """
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = id_match.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON of interest sits between these two literal fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        data_pattern = re.escape(BEFORE) + '(.*?)' + re.escape(AFTER)
        data_match = re.search(data_pattern, webpage)
        if not data_match:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(data_match.group(1)))
        params = json.loads(compat_urllib_parse.unquote(data['params']))
        video_data = params['video_data'][0]

        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src') or video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        title_match = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(title_match.group(1))

        return [{
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }]
1917
1918
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points straight at a video file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract one blip.tv video.

        '/play/' URLs are resolved through their redirect and re-extracted.
        Otherwise the JSON skin of the page is requested; a response with a
        'video/' content type is treated as a direct download and its open
        handle is passed on as 'urlhandle'.

        Returns a one-element list with the info dictionary. Raises
        ExtractorError for a non-matching URL, a failed download or
        unparsable video information.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # The redirect target carries the file name in its fragment
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Only byte strings are decoded: text strings have no
                # decode() on Python 3, and on Python 2 decoding one would
                # first try an implicit ASCII encode.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in 'Post' or given directly
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2012
2013
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Apply the RC4 stream cipher to data using key.

        data and key are iterated/indexed element-wise and each element is
        passed through compat_ord(). Returns a str holding one character per
        element of data.
        """
        # Key scheduling: permute the 256-entry state box with the key.
        x = 0
        box = list(range(256))
        for i in range(256):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]

        # Keystream generation, XORed onto the input element by element.
        x = 0
        y = 0
        out = []
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out.append(chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]))
        return ''.join(out)

    def __md5(self, s):
        """Return the hexadecimal MD5 digest of the bytes s, encoded as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Extract the video information for a myvideo.de watch URL.

        Two page layouts are handled: pages exposing a plain
        "source src='...'" tag, and pages whose flashvars point to an
        encrypted player XML that has to be decrypted first.

        Returns a list with a single info dictionary.
        Raises ExtractorError when a required piece of data is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded key material used to derive the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            # The extension found in the page is replaced by '.flv'.
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data: the payload is the hex string after the first '='
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(decoded GK + md5(video id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_file     = compat_urllib_parse.unquote(mobj.group(1))

        if not video_file.endswith('f4m'):
            # Split on the last dot only, so that a name containing more
            # than one dot does not break the unpacking.
            ppath, prefix = video_file.rsplit('.', 1)
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # The playlist URL is built from the 'path' attribute of the
            # decrypted data; it was previously read from a variable that
            # was never assigned.
            mobj = re.search('path=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract filepath')
            video_filepath = compat_urllib_parse.unquote(mobj.group(1))
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':                 video_id,
            'url':                video_rtmpurl,
            'tc_url':             video_rtmpurl,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2173
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like: http://www.thedailyshow.com/full-episodes/<episode>
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Format identifiers as used by this extractor.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one line per format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of an episode or clip.

        Returns a list with one info dictionary per <item> of the show
        index, or None when the 'listformats' parameter is set (the formats
        are printed instead).
        Raises ExtractorError when the URL or the downloaded data cannot be
        interpreted.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand the abbreviation to the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No episode in the URL: follow the redirect to the newest one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            # The group is None (not '') when the redirect target matches a
            # non-episode alternative, so test for both.
            if not mobj.group('episode'):
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the reference in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        # Get the format arg from the arg stream
        req_format = self._downloader.params.get('format', None)

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, rtmp url) pairs in document order
            turls = [
                (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                for rendition in cdoc.findall('.//rendition')
            ]

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the last rendition listed
            # (presumably the highest bitrate - the feed order is not checked)
            fmt, rtmp_video_url = turls[-1]

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    fmt, rtmp_video_url = f, v
                    break

            # Map the RTMP path onto the HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': fmt,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2340
2341
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video behind an Escapist video page URL.

        The page's og:video meta tag carries a player URL whose 'config='
        parameter points to a configuration file; the video URL is read from
        the second entry of that configuration's playlist.

        Returns a list with a single info dictionary.
        Raises ExtractorError when the URL is invalid or when the player URL,
        the configuration or the video URL cannot be obtained.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Description and thumbnail are optional fields: fall back to None
        # instead of crashing when the page does not provide them.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL is required to locate the configuration.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to find video URL in configuration file')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2395
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report the download of the XML manifest."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video behind a collegehumor.com video URL.

        Downloads the metadata XML for the video id, then the manifest it
        references, and builds the first-fragment URL from the manifest.

        Returns a list with a single info dictionary.
        Raises ExtractorError when the URL is invalid, a download fails, or
        one of the XML documents lacks an expected node or attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: a node is missing; KeyError: <media> has no 'url'.
            raise ExtractorError(u'Invalid manifest file')

        # The fragment URL lives on the manifest's host; the manifest id
        # (minus its last two characters) and the media url form its path.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2457
2458
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the video URL, title and thumbnail from a video page.

        Returns a list with a single info dictionary.
        Raises ExtractorError when the URL does not match or when any of the
        three values cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL: percent-encoded value of the flv_url parameter.
        flv_match = re.search(r'flv_url=(.+?)&', page)
        if flv_match is None:
            raise ExtractorError(u'Unable to extract video url')
        media_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: text of the <title> tag up to the " - XVID..." suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', page)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        title = title_match.group(1)

        # Thumbnail: the whole matched URL is used, not just the file name.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', page)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
2508
2509
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track page URL is resolved to its metadata through the
       api.soundcloud.com resolve endpoint; the media URL is then read
       from the stream definitions served by api.sndcdn.com for the
       resolved track id.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a track.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the title slug are taken from the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the normalized track page URL to the track metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions of the resolved track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(
            self._download_webpage(streams_url, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        track_info = {
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }
        return [track_info]
2566
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

       The set page URL is resolved to its metadata through the
       api.soundcloud.com resolve endpoint; for every track of the set
       the media URL is then read from the stream definitions served by
       api.sndcdn.com for that track id.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return one info dictionary per track of the set.

        Returns None, after reporting every error, when the resolve answer
        contains an 'errors' entry.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug are taken from the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the normalized set page URL to the set metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            for failure in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(failure['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            # Fetch the stream definitions of this track.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            mediaURL = json.loads(stream_json)['http_mp3_128_url']

            track_info = {
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            }
            videos.append(track_info)
        return videos
2629
2630
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMPE stream behind an infoq.com page URL.

        The page stores the stream path base64-encoded in a 'jsclassref'
        variable; the video id and extension are taken from the last path
        component of that stream path.

        Returns a list with a single info dictionary.
        Raises ExtractorError when the URL is invalid, or when the stream
        path, the title or the file extension cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split on the last dot only: a file name with several dots (or
        # none) must not surface as a bare ValueError.
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to extract video extension')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2677
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding it from UTF-8 only if it is bytes.

        Text strings have no usable decode() on Python 3, so values that are
        already text are passed through untouched.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a list of URLs, or
        directly a list of URLs when no bitrate information is available.
        An unknown, None or 'best' bitrate selects the highest one.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                handle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            # The connection was only a probe; do not leave it open.
            handle.close()
            return url

        return None

    def _print_formats(self, formats):
        """Print the available formats, one line per format and bitrate."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the audio file behind a mixcloud.com cloudcast URL.

        Returns a list with a single info dictionary, or None when the
        'listformats' parameter is set (the formats are printed instead).
        Raises ExtractorError when the URL is invalid, the API description
        cannot be retrieved, the requested format does not exist, or no
        candidate media URL can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        slug = self._to_text(mobj.group(2))
        file_id = uploader + u'-' + slug

        # construct API request from the matched groups, so that the result
        # does not depend on a trailing slash or extra URL components
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (uploader, slug)
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (read() returns bytes)
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # Take the first format that has a reachable URL.
            for candidate in formats:
                file_url = self.check_urls(self.get_urls(formats, candidate))
                if file_url is not None:
                    format_param = candidate
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')

        file_url = self._to_text(file_url)
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
2782
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _extract_linked_pages(self, page, link_re):
        """Run the extraction on every page linked from the given page.

        link_re must capture the link target, relative to MainFolder/, in
        its first group.  The matches are passed through orderedSet() and
        the results of self.extract() for each of them are concatenated
        into a single list.
        """
        results = []
        for link in orderedSet(re.findall(link_re, page)):
            results += self.extract('http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(link))
        return results

    def _real_extract(self, url):
        """Extract a single video, a whole course or the complete site.

        A URL with both 'course' and 'video' parameters yields one video;
        a URL with only 'course' yields every video linked from the course
        page; any other valid URL yields every course linked from the home
        page.  Returns a list of info dictionaries.

        Raises ExtractorError if the URL is invalid, a download fails or
        the video metadata XML lacks the expected elements.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE: this playlist description is only built locally; the
            # method returns the extracted videos, not this dictionary.
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            return self._extract_linked_pages(coursepage, r'<a href="(VideoPage.php\?[^"]+)">')
        else: # Root page
            # NOTE: as above, this dictionary is not part of the result.
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Use the same download helper as the course page so that the
            # regular expression below is applied to text, not raw bytes.
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote=u'Unable to download course info page')

            info['title'] = info['id']

            return self._extract_linked_pages(rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
2883
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the last listed rendition of an MTV.com video.

        Reads the song name, performer, mtvn_uri and content id from the
        video page, downloads the mediaGen XML and returns a one-element
        list with the info dictionary.

        Raises ExtractorError if the URL is invalid, any of the page fields
        is missing, the metadata download fails or the rendition data is
        incomplete.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        def search_page(pattern, error_message):
            """Return the first group of pattern in the page or raise."""
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(error_message)
            return found.group(1)

        # The page is handled as text here, like in the other extractors
        # of this file, so the matched values need no further decoding.
        song_name = unescapeHTML(search_page(
            r'<meta name="mtv_vt" content="([^"]+)"/>', u'Unable to extract song name'))
        performer = unescapeHTML(search_page(
            r'<meta name="mtv_an" content="([^"]+)"/>', u'Unable to extract performer'))
        video_title = performer + ' - ' + song_name

        mtvn_uri = search_page(
            r'<meta name="mtvn_uri" content="([^"]+)"/>', u'Unable to extract mtvn_uri')
        content_id = search_page(
            r'MTVN\.Player\.defaultPlaylistId = ([0-9]+);', u'Unable to extract content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition in the video metadata')

        # For now, always pick the highest quality.
        # NOTE(review): this takes the last rendition; presumably the XML
        # lists them in ascending quality - confirm.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        # find() returns None for a missing element; check explicitly.
        src = rendition.find('./src')
        if src is None or src.text is None:
            raise ExtractorError('Invalid rendition field.')
        video_url = src.text

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
2952
2953
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id built from the time in ms and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character table shuffled according to seed.

        The seed drives a linear congruential sequence; at every step one
        of the remaining characters is moved to the result list.  Returns
        the shuffled characters as a list.
        """
        mixed = []
        # The table contains a literal backslash between '/' and ':'.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        while source:
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # All characters are distinct, so popping by position removes
            # exactly the character that was selected.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of table indexes into the file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return one info dictionary per segment of the video.

        Raises ExtractorError if the URL is invalid or the playlist JSON
        cannot be parsed or lacks the expected entries.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            video_data = config['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']

            video_format = self._downloader.params.get('format', None)
            stream_file_ids = video_data['streamfileids']

            if video_format is None or video_format == 'best':
                if 'hd2' in stream_file_ids:
                    video_format = 'hd2'
                else:
                    video_format = 'flv'
                ext = u'flv'
            elif video_format == 'worst':
                video_format = 'mp4'
                ext = u'mp4'
            else:
                # Any other requested format falls back to flv.
                video_format = 'flv'
                ext = u'flv'

            fileid = stream_file_ids[video_format]
            keys = [s['k'] for s in video_data['segs'][video_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError):
            # IndexError covers an empty 'data' list in the response.
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 9 and 10 of fileid (fileid[8:10]) hold the segment
        # number; they are replaced with the index of each segment.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3046
3047
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Raises ExtractorError if the URL is invalid or if the video URL,
        the title or the thumbnail cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        def first_group(pattern, error_message):
            """Return group 1 of the first match in the page or raise."""
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(error_message)
            return found.group(1)

        # The three lookups run in this order: URL, title, thumbnail.
        video_url = compat_urllib_parse.unquote(
            first_group(self.VIDEO_URL_RE, u'Unable to extract video url'))
        video_title = first_group(self.VIDEO_TITLE_RE, u'Unable to extract video title')
        video_thumbnail = first_group(self.VIDEO_THUMB_RE, u'Unable to extract video thumbnail')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3091
3092
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the entry page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the date of the entry."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader of the entry."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title of the entry."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list describing the video of a post.

        The post page provides date, uploader and title; the linked photo
        page provides the video links, of which the one with the highest
        leading number is used.

        Raises ExtractorError if the URL is invalid or if the video page
        URL or the video links cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = r'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Choose the link with the highest leading number (presumably the
        # resolution).  The number is compared as an integer: compared as
        # text, "720" would rank above "1080".  The URL only breaks ties.
        best_link = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = best_link[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3202
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages"""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the path after "video", without a trailing index page.
        index_suffix = '/index.html'
        video_id = url_match.group(1)
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def find_prop(pattern, fallback=None):
            """Return the unescaped first group of pattern, or fallback."""
            found = re.search(pattern, webpage)
            return unescapeHTML(found.group(1)) if found else fallback

        short_id = video_id.rpartition('/')[2]
        # The short id doubles as the title when the page has no og:title.
        og_title = find_prop(r'<meta property="og:title" content="(.*?)"', short_id)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': og_title.replace('NBA.com: ', ''),
            'uploader_date': find_prop(r'<b>Date:</b> (.*?)</div>'),
            'description': find_prop(r'<div class="description">(.*?)</h1>'),
        }]
3237
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        last = offset + self._JUSTIN_PAGE_LIMIT
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, last))

    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips.

        Returns a tuple (number of items in the response, list of info
        dictionaries for the items that have a video file URL).  Raises
        ExtractorError if the response is not a list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            raise ExtractorError(u'Justin.tv API: %s' % response.get('error', 'unknown error'))

        clips = []
        for clip in response:
            clip_url = clip['video_file_url']
            if not clip_url:
                # Clips without a file URL are counted but not returned.
                continue
            clip_ext = os.path.splitext(clip_url)[1][1:]
            clip_date = clip['start_time'][:10].replace('-', '')
            uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            clips.append({
                'id': clip_id,
                'url': clip_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', uploader_id),
                'uploader_id': uploader_id,
                'upload_date': clip_date,
                'ext': clip_ext,
            })
        return (len(response), clips)

    def _extract_chapter(self, url, chapter_id):
        """Return the one-element info list for a chapter URL."""
        webpage = self._download_webpage(url, chapter_id)
        archive_match = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not archive_match:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = archive_match.group(1)

        api = 'http://api.justin.tv' + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
        # Compare against None explicitly: an element without children
        # is falsy, so a plain truth test would be wrong here.
        archive = next(
            (a for a in doc.findall('.//archive') if archive_id == a.find('./id').text),
            None)
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                               note='Downloading chapter metadata',
                               errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        channel = chapter_info['channel']
        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': channel['display_name'],
            'uploader_id': channel['name'],
        }]

    def _real_extract(self, url):
        """Extract a channel archive, a single broadcast or a chapter.

        Channel URLs are fetched page by page; broadcast URLs need a single
        request; chapter URLs are handled separately.  Returns a list of
        info dictionaries.  Raises ExtractorError if the URL is invalid.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        channel_id = mobj.group('channelid')
        chapter_id = mobj.group('chapterid')
        if channel_id:
            paged = True
            video_id = channel_id
            api = api_base + '/channel/archives/%s.json' % video_id
        elif chapter_id:
            return self._extract_chapter(url, chapter_id)
        else:
            paged = False
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
        offset = 0
        collected = []
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            collected.extend(page_info)
            # A full page means that there may be more items to fetch.
            if paged and page_count == limit:
                offset += limit
            else:
                break
        return collected
3370
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Raises ExtractorError if the URL is invalid or if the video source
        or the title cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The video URL is taken from the second <source> of the <video>.
        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if source_match is None:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player heading, fall back to the page <title>.
        title_match = (
            re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
            or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if title_match is None:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3408
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game videos"""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list with the playlist of a game's videos.

        The video page is requested through the age check URL with a fixed
        birth date.  Movie entries, titles and thumbnails are matched
        separately and combined by position.

        Raises ExtractorError if the URL is invalid or the page has no
        game title.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)

        title_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find game title for %s' % gameID)
        game_title = title_match.group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        videos = []
        # zip() stops at the shortest of the three match sequences.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            videos.append({
                'id': vid.group('videoID'),
                # The videoURL group requires at least one character, so
                # the URL cannot be empty here.
                'url': vid.group('videoURL'),
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3453
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos"""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a one-element list describing the recorded video.

        Raises ExtractorError if the URL is invalid or if the title, the
        uploader or the thumbnail cannot be found in the page.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # A failed search returns None, whose .group() lookup raises
            # the AttributeError handled below.
            m = re.search(r'data-title="(?P<title>.+)"', webpage)
            title = m.group('title')
            m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                          webpage, re.DOTALL)
            uploader = unescapeHTML(m.group('uploader').strip())
            m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
            thumb = m.group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
        # Return a list like the other extractors of this file do; results
        # of extract() get concatenated as lists elsewhere in this file.
        return [info]
3483
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Raises ExtractorError if the video URL or the title cannot be
        found in the page.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        source_match = re.search(r'so\.addVariable\("file","(.*?)"\)', page)
        if source_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", page)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        thumb_match = re.search(r'rel="image_src" href="(.*)" />', page)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            # No thumbnail: treated as a WSHH candy video, whose title is
            # taken from the "candytitles" element when it is present.
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3532
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the show.

        The metadata comes from the JSON assigned to ``gon.show`` in an
        inline script of the show page.

        Raises ExtractorError if that script is missing or is not valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        gon_match = re.search(
            r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if gon_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(gon_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3567
3568
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' is req_format, or None."""
        for candidate in formats:
            if candidate['format'] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Return the info dictionaries selected by the 'format' parameter.

        One dictionary is built per link in the page's download list. The
        downloader parameter 'format' then selects the first one (None or
        'best'), the last one ('worst'), all of them ('-1' or 'all') or the
        one with a matching format name. When 'listformats' is set the
        formats are printed and None is returned.

        Raises ExtractorError if the URL is invalid, if the title or the
        download list cannot be extracted, or if the requested format is
        not available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The page is requested with the age verification cookie set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional: only warn when it is missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional: only warn when it is missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component starts with "<size>_<bitrate>_".
            size, bitrate = path.split('/')[4].split('_')[:2]
            format_name = "-".join((size, bitrate))
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            # Check the lookup result itself; the regex match held in
            # `result` is never None at this point.
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
3683
3684
3685
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Id and title are taken from the URL itself; the media URL and the
        upload date are scraped from the downloaded page.

        Raises ExtractorError if the URL is invalid or if the media URL or
        the upload date cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This is the date lookup, so the message must name the date.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3724
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The video page only supplies the title and a link to an embed page;
        the media URL is read from that embed page.

        Raises ExtractorError if the URL is invalid or if the title, the
        embed page link or the media URL cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # First page: the regular video page.
        page = self._download_webpage(url, url_match.group('videoid'))

        title_match = re.search(r'<title>(?P<title>.*)</title>', page)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        # From here on the numeric id of the embed link is the video id.
        video_id = embed_match.group('videoid')

        # Second page: the embed page, which carries the media URL.
        embed_page = self._download_webpage(embed_page_url, video_id)
        source_match = re.search(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3769
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix.

        The mix description is read from the ``PAGE.mix`` assignment in the
        page; the tracks are then requested one at a time from the
        play/next endpoints until the response flags the last track.

        Raises ExtractorError if the URL is invalid or the mix description
        is missing.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if mix_match is None:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random number serves as the session token in the request URLs.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (track_number, track_count),
                errnote=u'Failed to download song information')
            track_set = json.loads(api_json)[u'set']
            track_data = track_set['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if track_set['at_last_track']:
                return tracks
            # The following track is requested relative to the current one.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3813
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the keek.

        Media and thumbnail URLs are built from the id found in the URL;
        title and uploader are scraped from the downloaded page.

        Raises ExtractorError if the title cannot be found. A missing
        uploader is not fatal and is reported as None.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            # Fail with the extractor's own error type rather than an
            # AttributeError on None.
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        # The uploader is an optional field, so a missing match yields None.
        uploader = clean_html(m.group('uploader')) if m is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
3837
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches _VALID_URL (compiled in verbose mode)."""
        return bool(re.match(cls._VALID_URL, url, re.VERBOSE))

    def _real_extract(self, url):
        """Dispatch to the talk or the playlist handler depending on the URL."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self,mediaSlug):
        """Build the download URL that belongs to mediaSlug."""
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        """Return a playlist result holding one URL result per listed talk."""
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'

        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        talk_items = re.finditer(video_RE, webpage, re.VERBOSE)
        talk_names = re.finditer(video_name_RE, webpage)
        playlist_title = re.search(playlist_RE, webpage).group('playlist_title')

        # Items and names are walked in lockstep, so the shorter of the two
        # sequences decides how many entries are produced.
        entries = [
            self.url_result('http://www.ted.com%s' % name_match.group('talk_url'), 'TED')
            for _item_match, name_match in zip(talk_items, talk_names)
        ]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the info dictionary of the talk found at url."""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading "%s" page' % talk_name)

        # If the url includes the language we get the title translated
        title = re.search(r'<span id="altHeadline" >(?P<title>.*)</span>', webpage).group('title')

        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        details = re.search(info_RE, webpage, re.VERBOSE)

        # The id from the talk details replaces the placeholder argument.
        video_id = details.group('videoID')
        video_url = self._talk_video_link(details.group('mediaSlug'))
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
3916
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The video id is the last non-empty path element of the URL; all
        other fields come from the XML metadata document for that id.

        Raises ExtractorError if the metadata lacks the download URL or the
        title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract the mandatory values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # The format falls back to the file extension when no format_id
        # element is present.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        # Description and thumbnail are optional.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        image_preview_el = metadata.find('imagePreview')
        thumbnail = image_preview_el.text if image_preview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3970
class SpiegelIE(InfoExtractor):
    """Information extractor for videos on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The title is scraped from the video page; file name and duration
        come from the last child of the per-video XML document.

        Raises ExtractorError if the title cannot be found.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last entry of the document is used.
        chosen = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4003
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # "https?" accepts both schemes; the former "http?" made the final "p"
    # optional and therefore never matched https:// URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError if the URL is invalid or if the media URL or
        the title cannot be found. Description and uploader are optional
        and default to None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        # Drop the site prefix from the og:title value.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4050
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de and mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Among the streams of media type 0 the one with the highest quality
        number is chosen. It is delivered either over RTMP (with the url
        and play_path fields set) or as a plain HTTP download.

        Raises ExtractorError if the title or a usable stream cannot be
        found, or if the chosen stream has an unexpected format.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')

        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Explicit raises instead of assert: assertions are removed
            # when Python runs with -O.
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'Unable to find any media streams')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            # max() would raise ValueError on an empty list.
            raise ExtractorError(u'Unable to find a stream of the default media type')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video url: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4089
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Returns an empty list when the post page contains no video.
        Raises ExtractorError if the page title cannot be found. A missing
        poster image is not fatal: the thumbnail is then None.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always work on the canonical post URL, whatever form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # Both values come from the URL, so they are escaped before being
        # embedded in the pattern. The page quotes attributes as \x22.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        m_thumb = re.search(re_thumb, webpage)
        if m_thumb is None:
            thumb = None
        else:
            thumb = m_thumb.group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage, re.DOTALL)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4123
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the track.

        Raises ExtractorError if the track page has no free download link.
        """
        url_title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, url_title)

        # We get the link to the free download page
        free_page_match = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if free_page_match is None:
            raise ExtractorError(u'No free songs founded')
        download_link = free_page_match.group(1)

        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # If we try to use this url directly it says the link has expired
        initial_url = track[u'downloads'][u'mp3-320'][u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        parts = re.match(re_url, initial_url)

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            parts.group('server'), parts.group('fsig'), track_id, parts.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
4169
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError if the URL is invalid or if the media URL or
        the title cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is the src of the mp4 <source> element.
        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      source_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4200
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary of the video.

        Everything is read from the MRSS document of the player, whose URL
        is built from the id in the page URL.

        Raises ExtractorError if the media URL or the title is missing.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)

        player_match = re.search(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', mrss)
        if player_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', mrss)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      player_match.group('mp4url'),
            'ext':      'mp4',
            'title':    title_match.group('titre'),
        }]
4229
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for a video.

        The page is re-requested under a canonical
        http://www.howcast.com/videos/<id> address built from the id in url.

        Raises ExtractorError if url does not match _VALID_URL, or if the
        video URL, title or thumbnail cannot be found. A missing description
        only produces a warning and is returned as None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Report a bad URL as an extractor error instead of failing with
            # AttributeError on mobj.group below.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        # The content attribute may be double- or single-quoted; exactly one
        # of the two groups is set on a match.
        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
4273
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for a Vine video.

        The page is re-requested under https://vine.co/v/<id>, using the id
        found in url.

        Raises ExtractorError if url does not match _VALID_URL, or if the
        video URL, title, thumbnail or uploader cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Report a bad URL as an extractor error instead of failing with
            # AttributeError on mobj.group below.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # Group 1 is the image URL up to an optional '?query' part, which is
        # captured separately in group 2 and discarded.
        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        # DOTALL lets '.*?' span the lines between the user div and its <h2>.
        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = mobj.group(1)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4316
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for a video.

        Three documents are fetched in turn: the photo page (for the
        photo_secret and the og: metadata), a first XML document (for the
        node id) and a second XML playlist (for the stream location).

        Raises ExtractorError if url does not match _VALID_URL, or if the
        secret, node id, video URL, title or thumbnail cannot be found. A
        missing description only produces a warning and is returned as None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Report a bad URL as an extractor error instead of failing with
            # AttributeError on mobj.group below.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = mobj.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = mobj.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final URL is the APP attribute followed by the HTML-unescaped
        # FULLPATH attribute of the STREAM element.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # The content attributes below may be double- or single-quoted;
        # exactly one of the two groups is set on a match.
        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1) or mobj.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4378
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for a video.

        The numeric video id is read from the page's <article> element and
        then used to fetch the cvp/2.0 XML document that holds the file URL.

        Raises ExtractorError if url does not match _VALID_URL, or if the
        video id, title, thumbnail, description or video URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        if mobj is None:
            # Without this check a page lacking the article markup would fail
            # with AttributeError instead of an extractor error.
            raise ExtractorError(u'Unable to extract video id')
        video_id = mobj.group(1)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        # '.*?' (not '.+?') so an empty description attribute still matches.
        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract description')
        description = mobj.group(1)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
4424
def gen_extractors():
    """Build one instance of each supported extractor, in priority order.

    Order is significant: the first extractor in the list that matches a URL
    is the one that handles it, so GenericIE stays last.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
4488
def get_info_extractor(ie_name):
    """Look up an extractor class by its name without the 'IE' suffix.

    The suffix is appended to ie_name and the resulting name is fetched from
    this module's global namespace; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]