git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 class InfoExtractor(object):
  27     """Information Extractor class.
  28
  29     Information extractors are the classes that, given a URL, extract
  30     information about the video (or videos) the URL refers to. This
  31     information includes the real video URL, the video title, author and
  32     others. The information is stored in a dictionary which is then
  33     passed to the FileDownloader. The FileDownloader processes this
  34     information possibly downloading the video to the file system, among
  35     other possible outcomes.
  36
  37     The dictionaries must include the following fields:
  38
  39     id:             Video identifier.
  40     url:            Final video URL.
  41     title:          Video title, unescaped.
  42     ext:            Video filename extension.
  43
  44     The following fields are optional:
  45
  46     format:         The video format, defaults to ext (used for --get-format)
  47     thumbnail:      Full URL to a video thumbnail image.
  48     description:    One-line video description.
  49     uploader:       Full name of the video uploader.
  50     upload_date:    Video upload date (YYYYMMDD).
  51     uploader_id:    Nickname or id of the video uploader.
  52     location:       Physical location of the video.
  53     player_url:     SWF Player URL (used for rtmpdump).
  54     subtitles:      The subtitle file contents.
  55     urlhandle:      [internal] The urlHandle to be used to download the file,
  56                     like returned by urllib.request.urlopen
  57
  58     The fields should all be Unicode strings.
  59
  60     Subclasses of this one should re-define the _real_initialize() and
  61     _real_extract() methods and define a _VALID_URL regexp.
  62     Probably, they should also be added to the list of extractors.
  63
  64     _real_extract() must return a *list* of information dictionaries as
  65     described above.
  66
  67     Finally, the _WORKING attribute should be set to False for broken IEs
  68     in order to warn the users and skip the tests.
  69     """
  70
  71     _ready = False
  72     _downloader = None
  73     _WORKING = True
  74
  75     def __init__(self, downloader=None):
  76         """Constructor. Receives an optional downloader."""
  77         self._ready = False
  78         self.set_downloader(downloader)
  79
  80     @classmethod
  81     def suitable(cls, url):
  82         """Receives a URL and returns True if suitable for this IE."""
  83         return re.match(cls._VALID_URL, url) is not None
  84
  85     @classmethod
  86     def working(cls):
  87         """Getter method for _WORKING."""
  88         return cls._WORKING
  89
  90     def initialize(self):
  91         """Initializes an instance (authentication, etc)."""
  92         if not self._ready:
  93             self._real_initialize()
  94             self._ready = True
  95
  96     def extract(self, url):
  97         """Extracts URL information and returns it in list of dicts."""
  98         self.initialize()
  99         return self._real_extract(url)
 100
 101     def set_downloader(self, downloader):
 102         """Sets the downloader for this IE."""
 103         self._downloader = downloader
 104
 105     def _real_initialize(self):
 106         """Real initialization process. Redefine in subclasses."""
 107         pass
 108
 109     def _real_extract(self, url):
 110         """Real extraction process. Redefine in subclasses."""
 111         pass
 112
 113     @property
 114     def IE_NAME(self):
 115         return type(self).__name__[:-2]
 116
 117     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 118         """ Returns the response handle """
 119         if note is None:
 120             self.report_download_webpage(video_id)
 121         elif note is not False:
 122             self.to_screen(u'%s: %s' % (video_id, note))
 123         try:
 124             return compat_urllib_request.urlopen(url_or_request)
 125         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 126             if errnote is None:
 127                 errnote = u'Unable to download webpage'
 128             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 129
 130     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 131         """ Returns a tuple (page content as string, URL handle) """
 132         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 133         content_type = urlh.headers.get('Content-Type', '')
 134         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 135         if m:
 136             encoding = m.group(1)
 137         else:
 138             encoding = 'utf-8'
 139         webpage_bytes = urlh.read()
 140         if self._downloader.params.get('dump_intermediate_pages', False):
 141             try:
 142                 url = url_or_request.get_full_url()
 143             except AttributeError:
 144                 url = url_or_request
 145             self.to_screen(u'Dumping request to ' + url)
 146             dump = base64.b64encode(webpage_bytes).decode('ascii')
 147             self._downloader.to_screen(dump)
 148         content = webpage_bytes.decode(encoding, 'replace')
 149         return (content, urlh)
 150
 151     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 152         """ Returns the data of the page as a string """
 153         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 154
 155     def to_screen(self, msg):
 156         """Print msg to screen, prefixing it with '[ie_name]'"""
 157         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 158
 159     def report_extraction(self, id_or_name):
 160         """Report information extraction."""
 161         self.to_screen(u'%s: Extracting information' % id_or_name)
 162
 163     def report_download_webpage(self, video_id):
 164         """Report webpage download."""
 165         self.to_screen(u'%s: Downloading webpage' % video_id)
 166
 167     def report_age_confirmation(self):
 168         """Report attempt to confirm age."""
 169         self.to_screen(u'Confirming age')
 170
 171     #Methods for following #608
 172     #They set the correct value of the '_type' key
 173     def video_result(self, video_info):
 174         """Returns a video"""
 175         video_info['_type'] = 'video'
 176         return video_info
 177     def url_result(self, url, ie=None):
 178         """Returns a url that points to a page that should be processed"""
 179         #TODO: ie should be the class used for getting the info
 180         video_info = {'_type': 'url',
 181                       'url': url,
 182                       'ie_key': ie}
 183         return video_info
 184     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 185         """Returns a playlist"""
 186         video_info = {'_type': 'playlist',
 187                       'entries': entries}
 188         if playlist_id:
 189             video_info['id'] = playlist_id
 190         if playlist_title:
 191             video_info['title'] = playlist_title
 192         return video_info
 193
 194     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 195         """
 196         Perform a regex search on the given string, using a single or a list of
 197         patterns returning the first matching group.
 198         In case of failure return a default value or raise a WARNING or a
 199         ExtractorError, depending on fatal, specifying the field name.
 200         """
 201         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 202             mobj = re.search(pattern, string, flags)
 203         else:
 204             for p in pattern:
 205                 mobj = re.search(p, string, flags)
 206                 if mobj: break
 207
 208         if sys.stderr.isatty() and os.name != 'nt':
 209             _name = u'\033[0;34m%s\033[0m' % name
 210         else:
 211             _name = name
 212
 213         if mobj:
 214             # return the first matching group
 215             return next(g for g in mobj.groups() if g is not None)
 216         elif default is not None:
 217             return default
 218         elif fatal:
 219             raise ExtractorError(u'Unable to extract %s' % _name)
 220         else:
 221             self._downloader.report_warning(u'unable to extract %s; '
 222                 u'please report this issue on GitHub.' % _name)
 223             return None
 224
 225     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
 226         """
 227         Like _search_regex, but strips HTML tags and unescapes entities.
 228         """
 229         res = self._search_regex(pattern, string, name, default, fatal, flags)
 230         if res:
 231             return clean_html(res).strip()
 232         else:
 233             return res
 234
 235 class SearchInfoExtractor(InfoExtractor):
 236     """
 237     Base class for paged search queries extractors.
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 240     """
 241
 242     @classmethod
 243     def _make_valid_url(cls):
 244         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 245
 246     @classmethod
 247     def suitable(cls, url):
 248         return re.match(cls._make_valid_url(), url) is not None
 249
 250     def _real_extract(self, query):
 251         mobj = re.match(self._make_valid_url(), query)
 252         if mobj is None:
 253             raise ExtractorError(u'Invalid search query "%s"' % query)
 254
 255         prefix = mobj.group('prefix')
 256         query = mobj.group('query')
 257         if prefix == '':
 258             return self._get_n_results(query, 1)
 259         elif prefix == 'all':
 260             return self._get_n_results(query, self._MAX_RESULTS)
 261         else:
 262             n = int(prefix)
 263             if n <= 0:
 264                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
 265             elif n > self._MAX_RESULTS:
 266                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 267                 n = self._MAX_RESULTS
 268             return self._get_n_results(query, n)
 269
 270     def _get_n_results(self, query, n):
 271         """Get a specified number of results for a query"""
 272         raise NotImplementedError("This method must be implemented by sublclasses")
 273
 274
 275 class YoutubeIE(InfoExtractor):
 276     """Information extractor for youtube.com."""
 277
    # Verbose-mode pattern: suitable() and _extract_id() match it with
    # re.VERBOSE, and _extract_id() reads the video ID from group 2.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Requested by _real_initialize() to set the interface language
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Login page fetched by _real_initialize(), which also posts the login form to it
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Target of the age confirmation form posted by _real_initialize()
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Used by _real_extract() to recover the original URL from a redirection URL
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Machine name looked up in .netrc when the 'usenetrc' option is set
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Format code -> file extension; _print_formats() shows 'flv' for codes missing here
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Format code -> dimensions string; _print_formats() shows '???' for codes missing here
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Fixed name used as the '[...]' prefix of screen messages
    IE_NAME = u'youtube'
 334
 335     @classmethod
 336     def suitable(cls, url):
 337         """Receives a URL and returns True if suitable for this IE."""
 338         if YoutubePlaylistIE.suitable(url): return False
 339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 340
 341     def report_lang(self):
 342         """Report attempt to set language."""
 343         self.to_screen(u'Setting language')
 344
 345     def report_login(self):
 346         """Report attempt to log in."""
 347         self.to_screen(u'Logging in')
 348
 349     def report_video_webpage_download(self, video_id):
 350         """Report attempt to download video webpage."""
 351         self.to_screen(u'%s: Downloading video webpage' % video_id)
 352
 353     def report_video_info_webpage_download(self, video_id):
 354         """Report attempt to download video info webpage."""
 355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 356
 357     def report_video_subtitles_download(self, video_id):
 358         """Report attempt to download video info webpage."""
 359         self.to_screen(u'%s: Checking available subtitles' % video_id)
 360
 361     def report_video_subtitles_request(self, video_id, sub_lang, format):
 362         """Report attempt to download video info webpage."""
 363         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 364
 365     def report_video_subtitles_available(self, video_id, sub_lang_list):
 366         """Report available subtitles."""
 367         sub_lang = ",".join(list(sub_lang_list.keys()))
 368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 369
 370     def report_information_extraction(self, video_id):
 371         """Report attempt to extract video information."""
 372         self.to_screen(u'%s: Extracting video information' % video_id)
 373
 374     def report_unavailable_format(self, video_id, format):
 375         """Report extracted video URL."""
 376         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 377
 378     def report_rtmp_download(self):
 379         """Indicate the download will use the RTMP protocol."""
 380         self.to_screen(u'RTMP download detected')
 381
 382     def _get_available_subtitles(self, video_id):
 383         self.report_video_subtitles_download(video_id)
 384         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 385         try:
 386             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 388             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 389         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 390         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 391         if not sub_lang_list:
 392             return (u'video doesn\'t have subtitles', None)
 393         return sub_lang_list
 394
 395     def _list_available_subtitles(self, video_id):
 396         sub_lang_list = self._get_available_subtitles(video_id)
 397         self.report_video_subtitles_available(video_id, sub_lang_list)
 398
 399     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 400         """
 401         Return tuple:
 402         (error_message, sub_lang, sub)
 403         """
 404         self.report_video_subtitles_request(video_id, sub_lang, format)
 405         params = compat_urllib_parse.urlencode({
 406             'lang': sub_lang,
 407             'name': sub_name,
 408             'v': video_id,
 409             'fmt': format,
 410         })
 411         url = 'http://www.youtube.com/api/timedtext?' + params
 412         try:
 413             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 416         if not sub:
 417             return (u'Did not fetch video subtitles', None, None)
 418         return (None, sub_lang, sub)
 419
 420     def _request_automatic_caption(self, video_id, webpage):
 421         """We need the webpage for getting the captions url, pass it as an
 422            argument to speed up the process."""
 423         sub_lang = self._downloader.params.get('subtitleslang')
 424         sub_format = self._downloader.params.get('subtitlesformat')
 425         self.to_screen(u'%s: Looking for automatic captions' % video_id)
 426         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 427         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 428         if mobj is None:
 429             return [(err_msg, None, None)]
 430         player_config = json.loads(mobj.group(1))
 431         try:
 432             args = player_config[u'args']
 433             caption_url = args[u'ttsurl']
 434             timestamp = args[u'timestamp']
 435             params = compat_urllib_parse.urlencode({
 436                 'lang': 'en',
 437                 'tlang': sub_lang,
 438                 'fmt': sub_format,
 439                 'ts': timestamp,
 440                 'kind': 'asr',
 441             })
 442             subtitles_url = caption_url + '&' + params
 443             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 444             return [(None, sub_lang, sub)]
 445         except KeyError:
 446             return [(err_msg, None, None)]
 447
 448     def _extract_subtitle(self, video_id):
 449         """
 450         Return a list with a tuple:
 451         [(error_message, sub_lang, sub)]
 452         """
 453         sub_lang_list = self._get_available_subtitles(video_id)
 454         sub_format = self._downloader.params.get('subtitlesformat')
 455         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 456             return [(sub_lang_list[0], None, None)]
 457         if self._downloader.params.get('subtitleslang', False):
 458             sub_lang = self._downloader.params.get('subtitleslang')
 459         elif 'en' in sub_lang_list:
 460             sub_lang = 'en'
 461         else:
 462             sub_lang = list(sub_lang_list.keys())[0]
 463         if not sub_lang in sub_lang_list:
 464             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 465
 466         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 467         return [subtitle]
 468
 469     def _extract_all_subtitles(self, video_id):
 470         sub_lang_list = self._get_available_subtitles(video_id)
 471         sub_format = self._downloader.params.get('subtitlesformat')
 472         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 473             return [(sub_lang_list[0], None, None)]
 474         subtitles = []
 475         for sub_lang in sub_lang_list:
 476             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 477             subtitles.append(subtitle)
 478         return subtitles
 479
 480     def _print_formats(self, formats):
 481         print('Available formats:')
 482         for x in formats:
 483             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 484
 485     def _real_initialize(self):
 486         if self._downloader is None:
 487             return
 488
 489         username = None
 490         password = None
 491         downloader_params = self._downloader.params
 492
 493         # Attempt to use provided username and password or .netrc data
 494         if downloader_params.get('username', None) is not None:
 495             username = downloader_params['username']
 496             password = downloader_params['password']
 497         elif downloader_params.get('usenetrc', False):
 498             try:
 499                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 500                 if info is not None:
 501                     username = info[0]
 502                     password = info[2]
 503                 else:
 504                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 505             except (IOError, netrc.NetrcParseError) as err:
 506                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 507                 return
 508
 509         # Set language
 510         request = compat_urllib_request.Request(self._LANG_URL)
 511         try:
 512             self.report_lang()
 513             compat_urllib_request.urlopen(request).read()
 514         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 515             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 516             return
 517
 518         # No authentication to be performed
 519         if username is None:
 520             return
 521
 522         request = compat_urllib_request.Request(self._LOGIN_URL)
 523         try:
 524             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 525         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 526             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 527             return
 528
 529         galx = None
 530         dsh = None
 531         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 532         if match:
 533           galx = match.group(1)
 534
 535         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 536         if match:
 537           dsh = match.group(1)
 538
 539         # Log in
 540         login_form_strs = {
 541                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 542                 u'Email': username,
 543                 u'GALX': galx,
 544                 u'Passwd': password,
 545                 u'PersistentCookie': u'yes',
 546                 u'_utf8': u'霱',
 547                 u'bgresponse': u'js_disabled',
 548                 u'checkConnection': u'',
 549                 u'checkedDomains': u'youtube',
 550                 u'dnConn': u'',
 551                 u'dsh': dsh,
 552                 u'pstMsg': u'0',
 553                 u'rmShown': u'1',
 554                 u'secTok': u'',
 555                 u'signIn': u'Sign in',
 556                 u'timeStmp': u'',
 557                 u'service': u'youtube',
 558                 u'uilel': u'3',
 559                 u'hl': u'en_US',
 560         }
 561         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 562         # chokes on unicode
 563         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 564         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 565         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 566         try:
 567             self.report_login()
 568             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 569             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 570                 self._downloader.report_warning(u'unable to log in: bad username or password')
 571                 return
 572         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 573             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 574             return
 575
 576         # Confirm age
 577         age_form = {
 578                 'next_url':     '/',
 579                 'action_confirm':   'Confirm',
 580                 }
 581         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 582         try:
 583             self.report_age_confirmation()
 584             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 585         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 586             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 587
 588     def _extract_id(self, url):
 589         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 590         if mobj is None:
 591             raise ExtractorError(u'Invalid URL: %s' % url)
 592         video_id = mobj.group(2)
 593         return video_id
 594
 595     def _real_extract(self, url):
 596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 597         mobj = re.search(self._NEXT_URL_RE, url)
 598         if mobj:
 599             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 600         video_id = self._extract_id(url)
 601
 602         # Get video webpage
 603         self.report_video_webpage_download(video_id)
 604         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 605         request = compat_urllib_request.Request(url)
 606         try:
 607             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 608         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 609             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 610
 611         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 612
 613         # Attempt to extract SWF player URL
 614         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 615         if mobj is not None:
 616             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 617         else:
 618             player_url = None
 619
 620         # Get video info
 621         self.report_video_info_webpage_download(video_id)
 622         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 623             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 624                     % (video_id, el_type))
 625             video_info_webpage = self._download_webpage(video_info_url, video_id,
 626                                     note=False,
 627                                     errnote='unable to download video info webpage')
 628             video_info = compat_parse_qs(video_info_webpage)
 629             if 'token' in video_info:
 630                 break
 631         if 'token' not in video_info:
 632             if 'reason' in video_info:
 633                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 634             else:
 635                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 636
 637         # Check for "rental" videos
 638         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 639             raise ExtractorError(u'"rental" videos not supported')
 640
 641         # Start extracting information
 642         self.report_information_extraction(video_id)
 643
 644         # uploader
 645         if 'author' not in video_info:
 646             raise ExtractorError(u'Unable to extract uploader name')
 647         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 648
 649         # uploader_id
 650         video_uploader_id = None
 651         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 652         if mobj is not None:
 653             video_uploader_id = mobj.group(1)
 654         else:
 655             self._downloader.report_warning(u'unable to extract uploader nickname')
 656
 657         # title
 658         if 'title' not in video_info:
 659             raise ExtractorError(u'Unable to extract video title')
 660         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 661
 662         # thumbnail image
 663         if 'thumbnail_url' not in video_info:
 664             self._downloader.report_warning(u'unable to extract video thumbnail')
 665             video_thumbnail = ''
 666         else:   # don't panic if we can't find it
 667             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 668
 669         # upload date
 670         upload_date = None
 671         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 672         if mobj is not None:
 673             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 674             upload_date = unified_strdate(upload_date)
 675
 676         # description
 677         video_description = get_element_by_id("eow-description", video_webpage)
 678         if video_description:
 679             video_description = clean_html(video_description)
 680         else:
 681             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 682             if fd_mobj:
 683                 video_description = unescapeHTML(fd_mobj.group(1))
 684             else:
 685                 video_description = u''
 686
 687         # subtitles
 688         video_subtitles = None
 689
 690         if self._downloader.params.get('writesubtitles', False):
 691             video_subtitles = self._extract_subtitle(video_id)
 692             if video_subtitles:
 693                 (sub_error, sub_lang, sub) = video_subtitles[0]
 694                 if sub_error:
 695                     # We try with the automatic captions
 696                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 697                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
 698                     if sub is not None:
 699                         pass
 700                     else:
 701                         # We report the original error
 702                         self._downloader.report_error(sub_error)
 703
 704         if self._downloader.params.get('allsubtitles', False):
 705             video_subtitles = self._extract_all_subtitles(video_id)
 706             for video_subtitle in video_subtitles:
 707                 (sub_error, sub_lang, sub) = video_subtitle
 708                 if sub_error:
 709                     self._downloader.report_error(sub_error)
 710
 711         if self._downloader.params.get('listsubtitles', False):
 712             sub_lang_list = self._list_available_subtitles(video_id)
 713             return
 714
 715         if 'length_seconds' not in video_info:
 716             self._downloader.report_warning(u'unable to extract video duration')
 717             video_duration = ''
 718         else:
 719             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 720
 721         # token
 722         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 723
 724         # Decide which formats to download
 725         req_format = self._downloader.params.get('format', None)
 726
 727         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 728             self.report_rtmp_download()
 729             video_url_list = [(None, video_info['conn'][0])]
 730         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 731             url_map = {}
 732             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 733                 url_data = compat_parse_qs(url_data_str)
 734                 if 'itag' in url_data and 'url' in url_data:
 735                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 736                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 737                     url_map[url_data['itag'][0]] = url
 738
 739             format_limit = self._downloader.params.get('format_limit', None)
 740             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 741             if format_limit is not None and format_limit in available_formats:
 742                 format_list = available_formats[available_formats.index(format_limit):]
 743             else:
 744                 format_list = available_formats
 745             existing_formats = [x for x in format_list if x in url_map]
 746             if len(existing_formats) == 0:
 747                 raise ExtractorError(u'no known formats available for video')
 748             if self._downloader.params.get('listformats', None):
 749                 self._print_formats(existing_formats)
 750                 return
 751             if req_format is None or req_format == 'best':
 752                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 753             elif req_format == 'worst':
 754                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 755             elif req_format in ('-1', 'all'):
 756                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 757             else:
 758                 # Specific formats. We pick the first in a slash-delimeted sequence.
 759                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 760                 req_formats = req_format.split('/')
 761                 video_url_list = None
 762                 for rf in req_formats:
 763                     if rf in url_map:
 764                         video_url_list = [(rf, url_map[rf])]
 765                         break
 766                 if video_url_list is None:
 767                     raise ExtractorError(u'requested format not available')
 768         else:
 769             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 770
 771         results = []
 772         for format_param, video_real_url in video_url_list:
 773             # Extension
 774             video_extension = self._video_extensions.get(format_param, 'flv')
 775
 776             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 777                                               self._video_dimensions.get(format_param, '???'))
 778
 779             results.append({
 780                 'id':       video_id,
 781                 'url':      video_real_url,
 782                 'uploader': video_uploader,
 783                 'uploader_id': video_uploader_id,
 784                 'upload_date':  upload_date,
 785                 'title':    video_title,
 786                 'ext':      video_extension,
 787                 'format':   video_format,
 788                 'thumbnail':    video_thumbnail,
 789                 'description':  video_description,
 790                 'player_url':   player_url,
 791                 'subtitles':    video_subtitles,
 792                 'duration':     video_duration
 793             })
 794         return results
 795
 796
 797 class MetacafeIE(InfoExtractor):
 798     """Information Extractor for metacafe.com."""
 799
 800     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 801     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 802     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 803     IE_NAME = u'metacafe'
 804
 805     def report_disclaimer(self):
 806         """Report disclaimer retrieval."""
 807         self.to_screen(u'Retrieving disclaimer')
 808
 809     def _real_initialize(self):
 810         # Retrieve disclaimer
 811         request = compat_urllib_request.Request(self._DISCLAIMER)
 812         try:
 813             self.report_disclaimer()
 814             disclaimer = compat_urllib_request.urlopen(request).read()
 815         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 816             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 817
 818         # Confirm age
 819         disclaimer_form = {
 820             'filters': '0',
 821             'submit': "Continue - I'm over 18",
 822             }
 823         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 824         try:
 825             self.report_age_confirmation()
 826             disclaimer = compat_urllib_request.urlopen(request).read()
 827         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 828             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 829
 830     def _real_extract(self, url):
 831         # Extract id and simplified title from URL
 832         mobj = re.match(self._VALID_URL, url)
 833         if mobj is None:
 834             raise ExtractorError(u'Invalid URL: %s' % url)
 835
 836         video_id = mobj.group(1)
 837
 838         # Check if video comes from YouTube
 839         mobj2 = re.match(r'^yt-(.*)$', video_id)
 840         if mobj2 is not None:
 841             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 842
 843         # Retrieve video webpage to extract further information
 844         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 845
 846         # Extract URL, uploader and title from webpage
 847         self.report_extraction(video_id)
 848         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 849         if mobj is not None:
 850             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 851             video_extension = mediaURL[-3:]
 852
 853             # Extract gdaKey if available
 854             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 855             if mobj is None:
 856                 video_url = mediaURL
 857             else:
 858                 gdaKey = mobj.group(1)
 859                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 860         else:
 861             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 862             if mobj is None:
 863                 raise ExtractorError(u'Unable to extract media URL')
 864             vardict = compat_parse_qs(mobj.group(1))
 865             if 'mediaData' not in vardict:
 866                 raise ExtractorError(u'Unable to extract media URL')
 867             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 868             if mobj is None:
 869                 raise ExtractorError(u'Unable to extract media URL')
 870             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 871             video_extension = mediaURL[-3:]
 872             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 873
 874         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 875         if mobj is None:
 876             raise ExtractorError(u'Unable to extract title')
 877         video_title = mobj.group(1).decode('utf-8')
 878
 879         mobj = re.search(r'submitter=(.*?);', webpage)
 880         if mobj is None:
 881             raise ExtractorError(u'Unable to extract uploader nickname')
 882         video_uploader = mobj.group(1)
 883
 884         return [{
 885             'id':       video_id.decode('utf-8'),
 886             'url':      video_url.decode('utf-8'),
 887             'uploader': video_uploader.decode('utf-8'),
 888             'upload_date':  None,
 889             'title':    video_title,
 890             'ext':      video_extension.decode('utf-8'),
 891         }]
 892
 893 class DailymotionIE(InfoExtractor):
 894     """Information Extractor for Dailymotion"""
 895
 896     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 897     IE_NAME = u'dailymotion'
 898
 899     def _real_extract(self, url):
 900         # Extract id and simplified title from URL
 901         mobj = re.match(self._VALID_URL, url)
 902         if mobj is None:
 903             raise ExtractorError(u'Invalid URL: %s' % url)
 904
 905         video_id = mobj.group(1).split('_')[0].split('?')[0]
 906
 907         video_extension = 'mp4'
 908
 909         # Retrieve video webpage to extract further information
 910         request = compat_urllib_request.Request(url)
 911         request.add_header('Cookie', 'family_filter=off')
 912         webpage = self._download_webpage(request, video_id)
 913
 914         # Extract URL, uploader and title from webpage
 915         self.report_extraction(video_id)
 916         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 917         if mobj is None:
 918             raise ExtractorError(u'Unable to extract media URL')
 919         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 920
 921         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 922             if key in flashvars:
 923                 max_quality = key
 924                 self.to_screen(u'Using %s' % key)
 925                 break
 926         else:
 927             raise ExtractorError(u'Unable to extract video URL')
 928
 929         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 930         if mobj is None:
 931             raise ExtractorError(u'Unable to extract video URL')
 932
 933         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 934
 935         # TODO: support choosing qualities
 936
 937         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 938         if mobj is None:
 939             raise ExtractorError(u'Unable to extract title')
 940         video_title = unescapeHTML(mobj.group('title'))
 941
 942         video_uploader = None
 943         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 944         if mobj is None:
 945             # lookin for official user
 946             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 947             if mobj_official is None:
 948                 self._downloader.report_warning(u'unable to extract uploader nickname')
 949             else:
 950                 video_uploader = mobj_official.group(1)
 951         else:
 952             video_uploader = mobj.group(1)
 953
 954         video_upload_date = None
 955         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 956         if mobj is not None:
 957             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 958
 959         return [{
 960             'id':       video_id,
 961             'url':      video_url,
 962             'uploader': video_uploader,
 963             'upload_date':  video_upload_date,
 964             'title':    video_title,
 965             'ext':      video_extension,
 966         }]
 967
 968
 969 class PhotobucketIE(InfoExtractor):
 970     """Information extractor for photobucket.com."""
 971
 972     # TODO: the original _VALID_URL was:
 973     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 974     # Check if it's necessary to keep the old extracion process
 975     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 976     IE_NAME = u'photobucket'
 977
 978     def _real_extract(self, url):
 979         # Extract id from URL
 980         mobj = re.match(self._VALID_URL, url)
 981         if mobj is None:
 982             raise ExtractorError(u'Invalid URL: %s' % url)
 983
 984         video_id = mobj.group('id')
 985
 986         video_extension = mobj.group('ext')
 987
 988         # Retrieve video webpage to extract further information
 989         webpage = self._download_webpage(url, video_id)
 990
 991         # Extract URL, uploader, and title from webpage
 992         self.report_extraction(video_id)
 993         # We try first by looking the javascript code:
 994         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 995         if mobj is not None:
 996             info = json.loads(mobj.group('json'))
 997             return [{
 998                 'id':       video_id,
 999                 'url':      info[u'downloadUrl'],
1000                 'uploader': info[u'username'],
1001                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1002                 'title':    info[u'title'],
1003                 'ext':      video_extension,
1004                 'thumbnail': info[u'thumbUrl'],
1005             }]
1006
1007         # We try looking in other parts of the webpage
1008         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1009             webpage, u'video URL')
1010
1011         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1012         if mobj is None:
1013             raise ExtractorError(u'Unable to extract title')
1014         video_title = mobj.group(1).decode('utf-8')
1015         video_uploader = mobj.group(2).decode('utf-8')
1016
1017         return [{
1018             'id':       video_id.decode('utf-8'),
1019             'url':      video_url.decode('utf-8'),
1020             'uploader': video_uploader,
1021             'upload_date':  None,
1022             'title':    video_title,
1023             'ext':      video_extension.decode('utf-8'),
1024         }]
1025
1026
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract stream location and metadata for a screen.yahoo.com video.

        Two paths exist, selected by whether the page defines a Media
        CONTENT_ID: without it, two MRSS documents are queried with the id
        from the URL; with it, a YQL JSONP endpoint is queried with that id.

        Returns a single info dictionary with the keys id, url, play_path,
        title, description, thumbnail, upload_date and ext.
        NOTE(review): sibling extractors in this file return a list of
        dictionaries; confirm callers accept a bare dict here.

        Raises ExtractorError if the URL is invalid or if the expected
        information cannot be found in a response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Convert the captured MM/DD/YYYY date to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Check the match before reading its groups, so a missing element
            # is reported as an ExtractorError and not an AttributeError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get at the JSON payload.
            m_json = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage)
            if m_json is None:
                raise ExtractorError(u'Unable to extract video info')
            info = json.loads(m_json.group(1))
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1094
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the play-redirect URL and metadata for a Vimeo video.

        The page's embedded config JSON supplies title, owner, thumbnail,
        request signature/timestamp and the available files; description and
        upload date are read from the page markup.

        new_video is accepted but not used by this method.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL is invalid, the config section
        cannot be parsed, or none of the known codecs is listed.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Direct links and vimeopro URLs are replaced by the canonical page URL.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is absent from the page.
            # ValueError: the text between the markers is not valid JSON.
            # Catching only these keeps KeyboardInterrupt/SystemExit from
            # being converted into an extraction error.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        # The uploader id is the last path component of the owner URL, if any.
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first entry of the best non-empty quality bucket.
        for quality in ('hd', 'sd', 'other'):
            if files[quality]:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1196
1197
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body.

        Raises ExtractorError on network errors or when the URL is rejected
        with a ValueError while opening it.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            return compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the requested groups.

        matchTuples is an iterable of (group, key, error message) triples;
        each group's text is stored under its key in the returned dict.

        Raises ExtractorError if the regex does not match at all, or with the
        triple's own message if one of the requested groups is None.
        """
        page = self.fetch_webpage(url)
        found = re.search(regex, page, regexFlags)
        if found is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        collected = {}
        for group_ref, field, message in matchTuples:
            value = found.group(group_ref)
            if value is None:
                raise ExtractorError(message)
            collected[field] = value
        return collected

    def extractLiveStream(self, url):
        """Follow the live-stream pages and compute the stream URL.

        NOTE: the computed URL is only assigned to a local variable; this
        method returns None.
        """
        url_parts = url.split('/')
        video_lang = url_parts[-4]
        http_host = url_parts[2]

        # Step 1: find the videothek javascript file referenced by the page.
        script_info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [(1, 'url', u'Invalid URL: %s' % url)],
        )
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(script_info.get('url')))

        # Step 2: pull path, player and rtmp URL out of that javascript file.
        stream_regex = (
            r'(s_artestras_scst_geoFRDE_' + video_lang + ".*?)'.*?" +
            r'(http://.*?\.swf).*?' +
            "(rtmp://.*?)'"
        )
        stream_info = self.grep_webpage(
            next_url,
            stream_regex,
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url),
            ],
        )
        video_url = u'%s/%s' % (stream_info.get('url'), stream_info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the three-step reference chain and return the info dictionary.

        The page points at a reference file, which points at a per-language
        video description, which finally holds id, name, date and the "hd"
        stream URL.
        """
        video_lang = url.split('/')[-3]

        # Step 1: the player parameter carries the reference file URL.
        ref_info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [(1, 'url', u'Invalid URL: %s' % url)],
        )
        ref_url = compat_urllib_parse.unquote(ref_info.get('url'))

        # Step 2: pick the <video> entry for the language taken from the URL.
        lang_info = self.grep_webpage(
            ref_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [(1, 'url', u'Could not find <video> tag: %s' % url)],
        )
        details_url = compat_urllib_parse.unquote(lang_info.get('url'))

        # Step 3: read the actual metadata and stream URL.
        details_regex = (
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>'
        )
        info = self.grep_webpage(
            details_url,
            details_regex,
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url),
            ],
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live or Plus7 extraction depending on the URL's last component.

        Returns None for live URLs and a one-element list holding the info
        dictionary otherwise.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        is_live = re.search(self._LIVE_URL, video_id) is not None
        if is_live:
            self.extractLiveStream(url)
            return

        return [self.extractPlus7Stream(url)]
1317
1318
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Its _VALID_URL matches any string.  Extraction first checks, using HEAD
    requests, whether the URL redirects elsewhere; if it does not, the page
    is downloaded and searched for a JW Player style media URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that the generic extractor is being used, unless the
        # downloader has its 'test' parameter set.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns the final URL when it differs from url, and False otherwise.
        Raises ExtractorError when the opener yields no response.
        """
        class HeadRequest(compat_urllib_request.Request):
            """Request that is always sent with the HEAD method."""
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers are not carried over to the
                    # redirected request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # A plain Request is used here, so the retry is not a HEAD.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Extract video information from an arbitrary web page.

        Returns a one-element list holding either a url_result for the
        redirect target (when the URL redirects) or an info dictionary
        built from the media URL found in the page.  Raises ExtractorError
        when the URL is invalid or no media URL, title or uploader can be
        found.
        """
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Split the basename into extension (without the dot) and id.
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # This failure concerns the uploader, not the title.
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1452
1453
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # Only byte strings are decoded: text strings have no decode() on
        # Python 3 and would go through an implicit ASCII encode on Python 2.
        if isinstance(query, bytes):
            query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Downloads API pages of 50 results each until n ids have been
        requested or the total reported by the API is reached, and returns
        a playlist result of at most n YouTube watch URLs.  Raises
        ExtractorError when a page cannot be downloaded or a response
        contains no 'items'.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids += [video['id'] for video in api_response['items']]

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past n.
        video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1496
1497
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose 'entries' list holds at most n
        'url' entries, in the order they appear on the result pages.
        """

        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }

        # Pages are counted from 0 so that the first request uses start=0
        # instead of start=10.
        for pagenum in itertools.count(0):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            hits = re.findall(r'<h3 class="r"><a href="([^"]+)"', webpage)
            for hit in hits:
                entries.append({
                    '_type': 'url',
                    'url': hit
                })

            # Stop once enough entries were collected, when a page yields
            # nothing (guards against looping forever), or when the page
            # carries no "next page" marker.
            if (len(entries) >= n or not hits
                    or not re.search(self._MORE_PAGES_INDICATOR, webpage)):
                del entries[n:]
                return res
1528
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose 'entries' list holds at most n
        url results handled by the 'Yahoo' extractor.
        """

        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a recognizable screen.yahoo.com link.
                    continue
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # The stop condition relies on the number of collected entries
            # rather than on a loop index, which would be undefined for an
            # empty result list.
            if (len(entries) >= n or not results
                    or m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1562
1563
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence re.VERBOSE.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list with the playlist result for url.

        The playlist is fetched page by page from the API; entries are
        ordered by their 'yt$position' value.  Raises ExtractorError for an
        invalid URL, invalid JSON or a response without a 'feed' key.
        """
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Either of the two alternatives of _VALID_URL captures the id.
        playlist_id = id_match.group(1) or id_match.group(2)

        positioned = []
        page_num = 1
        while True:
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            page_entries = feed['entry']
            for entry in page_entries:
                if 'content' in entry:
                    positioned.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page is the last one.
            if len(page_entries) < self._MAX_RESULTS:
                break
            page_num += 1

        positioned.sort()

        url_results = [self.url_result(pair[1], 'Youtube') for pair in positioned]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1629
1630
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, de-duplicated, in first-seen order."""
        unique_ids = []
        for found in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if found in unique_ids:
                continue
            unique_ids.append(found)
        return unique_ids

    def _real_extract(self, url):
        """Return a one-element list with the playlist result for a channel URL.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        # Extract channel id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # Download the first channel page (plain HTML)
        pagenum = 1
        first_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum += 1

            ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
            raw_page = self._download_webpage(ajax_url, channel_id,
                                              u'Downloading page #%s' % pagenum)
            ajax_page = json.loads(raw_page)

            video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

            more_pages = self._MORE_PAGES_INDICATOR in ajax_page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in video_ids
        ]
        return [self.playlist_result(url_entries, channel_id)]
1688
1689
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a one-element list with the playlist of a user's uploads.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Index of the last video requested on this page (inclusive).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, end_index))

            # Extract video identifiers, keeping the first occurrence only
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1746
1747
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a one-element list with the playlist of a user's videos.

        Raises ExtractorError when url does not match _VALID_URL or when
        the user page carries no 'data-users-id' attribute.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Without the numeric users id the episode list cannot be queried.
            raise ExtractorError(u'Unable to extract blip.tv users id from: %s' % url)
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(page_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped value, since that is what is stored.
                video_path = unescapeHTML(mobj.group(1))
                if video_path not in ids_in_page:
                    ids_in_page.append(video_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'BlipTV') for video_url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1806
1807
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a file.

        Raises ExtractorError when the file page cannot be retrieved or no
        download URL can be found in it; if the page carries an
        'Attention' notice, its text is used as the error message.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POST data has to be bytes on Python 3.
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once so that the text regexes below work on both
            # Python 2 and Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # file_id comes from the caller's URL and may still be a byte string.
        if isinstance(file_id, bytes):
            file_id = file_id.decode('utf-8')

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1852
1853
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in when credentials are available.

        Credentials come from the downloader's 'username'/'password'
        parameters or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE.  Login problems are reported as warnings and never
        raised.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data has to be bytes on Python 3.
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            # Decode the response so the text regex below works on Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Getting the login form back means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a video.

        Raises ExtractorError for an invalid URL, when the player data
        cannot be located in the page, or when no video URL is present.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON literal located between these two
        # fixed JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1948
1949
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of a video.

        '/play/' and 'api.swf#' URLs are first resolved to a regular
        blip.tv URL.  Raises ExtractorError for an invalid URL, when the
        info page cannot be downloaded or read, or when it cannot be
        parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            # Only the final URL is needed; release the connection.
            response.close()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The file id is the last path component of the 'file' value
            # carried in the fragment of the redirect target.
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Text strings need no decoding (and have no decode() on
                # Python 3); only byte strings are converted.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # The open handle is passed on with the result.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2047
2048
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """Apply the RC4 key schedule and keystream to data.

        Each item of data and key is converted to an integer with
        compat_ord. The result is a string with one chr() character per
        item of data.
        """
        # Key-scheduling: permute the 256-entry box according to the key.
        x = 0
        box = list(range(256))
        for i in range(256):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]

        # Keystream generation: XOR every input item with the next box value.
        x = 0
        y = 0
        out = []
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out.append(chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]))
        # Joining once avoids the quadratic cost of repeated string +=.
        return ''.join(out)

    def __md5(self,s):
        """Return the hexadecimal MD5 digest of s, encoded to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract the video described by a myvideo.de watch URL.

        Returns a list holding a single info dictionary. Raises
        ExtractorError when the URL does not match _VALID_URL or when the
        page / decrypted player data lack the expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded key material; it is combined with the MD5 of
        # the video id below to derive the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: the page carries a plain <source src='...'> element.
        mobj = re.search(r"source src='(.+?)[.]([^.]+)'", webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Every flashvar except _encxml becomes a query parameter of the
        # request sent to the _encxml URL.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall(r"(.+?):'(.+?)',?", sec):
            if a == '_encxml':
                encxml = compat_urllib_parse.unquote(b)
            else:
                params[a] = b
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search(r"connectionurl='(.*?)'", dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search(r"path='(http.*?)' source='(.*?)'", dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex(r"source='(.*?)'", dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # The playlist location is the file path plus the manifest name
            # with its suffix swapped to .m3u8.
            # NOTE(review): the path is taken from the path='...' attribute
            # of the decrypted data, mirroring the non-RTMP branch above --
            # confirm against a real f4m response.
            path_mobj = re.search(r"path='(.*?)'", dec_data)
            if path_mobj is None:
                video_filepath = ''
            else:
                video_filepath = compat_urllib_parse.unquote(path_mobj.group(1))
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex(r"swfobject.embedSWF\('(.+?)'", webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2197
2198
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # URLs can be abbreviations like :thedailyshow or :colbert,
    # full-episode URLs (<show>.com/full-episodes/<episode>),
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one line per format id: id, extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract every part of an episode or clip.

        Returns a list with one info dictionary per <item> of the show
        index, or None after printing the formats when the 'listformats'
        parameter is set. Raises ExtractorError for unusable URLs, pages
        without a media reference, and RTMP URLs that cannot be rewritten.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand the ":tds" style abbreviations to the full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "whatever the site redirects to".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            # The group is '' for a bare full-episodes URL and None when the
            # redirect landed on a clip URL; neither names an episode, and a
            # None title would break the string concatenation further down.
            if not mobj.group('episode'):
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if not mMovieParams:
            # The Colbert Report embeds the reference in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if not altMovieParams:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, source URL) pairs in document order.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if not turls:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the last rendition
            # (presumably the highest bitrate -- depends on feed ordering).
            video_format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP URL on the mobile host.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2365
2366
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video behind an Escapist video page URL.

        Returns a list holding a single info dictionary. Raises
        ExtractorError when the URL is invalid, the player configuration is
        not parseable, or it does not contain the expected playlist entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webpage = self._download_webpage(url, showName)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The player URL carries the (URL-quoted) configuration URL in its
        # config= parameter.
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The video URL is read from the second playlist entry. Report an
        # unexpected layout as an extraction failure rather than letting a
        # bare KeyError/IndexError escape.
        try:
            playlist = config['playlist']
            videoUrl = playlist[1]['url']
        except (KeyError, IndexError, TypeError) as err:
            raise ExtractorError(u'Unable to find video URL in configuration file: ' + repr(err))

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
2423
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video behind a collegehumor.com video URL.

        Downloads the metadata XML and the manifest it points to, then
        builds the first-fragment URL from them. Returns a list holding a
        single info dictionary. Raises ExtractorError on an invalid URL,
        download failure, or XML lacking the expected nodes/attributes.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: a node is missing; KeyError: <media> has no url
            # attribute. Both mean the manifest is unusable.
            raise ExtractorError(u'Invalid manifest file')

        # The fragment URL reuses the manifest's scheme and host; the last
        # two characters of the manifest id are dropped from the path.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2485
2486
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the video behind an xvideos.com video URL.

        Returns a list holding a single info dictionary. Raises
        ExtractorError when the URL does not match _VALID_URL; the
        thumbnail lookup is non-fatal.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail. The capture group spans the whole URL so
        # that the search yields a full thumbnail URL, not just the trailing
        # file name.
        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
2527
2528
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track page URL is resolved to its metadata through the
    api.soundcloud.com resolve endpoint; the numeric track id found there
    is then used to fetch the stream definitions, from which the 128 kbit
    MP3 HTTP URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        message = u'%s: Resolving id' % video_id
        self.to_screen(message)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the track.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both pieces come straight from the URL: the uploader name and the
        # slug of the song title.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track URL to the track metadata.
        track_page = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolve_endpoint = 'http://api.soundcloud.com/resolve.json?url=' + track_page + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolve_endpoint, full_title, u'Downloading info JSON'))
        track_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions belonging to the resolved track id.
        streams_endpoint = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(
            self._download_webpage(streams_endpoint, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))

        media_url = streams['http_mp3_128_url']
        created = unified_strdate(info['created_at'])

        track_info = {
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            'upload_date': created,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }
        return [track_info]
2585
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set page URL is resolved to its metadata through the
    api.soundcloud.com resolve endpoint; for every track listed there the
    stream definitions are fetched and the 128 kbit MP3 HTTP URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        message = u'%s: Resolving id' % video_id
        self.to_screen(message)

    def _real_extract(self, url):
        """Return one info dictionary per track of the set.

        Raises ExtractorError when url does not match _VALID_URL. When the
        resolve response contains 'errors', each one is reported through
        the downloader and None is returned.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both pieces come straight from the URL: the uploader name and the
        # slug of the set title.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical set URL to the set metadata.
        set_page = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolve_endpoint = 'http://api.soundcloud.com/resolve.json?url=' + set_page + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolve_endpoint, full_title)

        entries = []
        info = json.loads(info_json)
        if 'errors' in info:
            for problem in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(problem['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            track_id = track['id']

            # Fetch the stream definitions belonging to this track.
            streams_endpoint = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_endpoint, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            media_url = json.loads(stream_json)['http_mp3_128_url']

            entry = {
                'id':       track_id,
                'url':      media_url,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            }
            entries.append(entry)
        return entries
2648
2649
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the video behind an infoq.com page URL.

        Returns a list holding a single info dictionary. Raises
        ExtractorError when the URL is invalid, the page lacks the
        jsclassref value, or the media file name has no extension.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds the base64-encoded, URL-quoted
        # media path that is appended to the RTMPE prefix.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Split on the last dot only, so extra dots in the file name stay
        # part of the id instead of breaking the two-way unpacking.
        video_filename = video_url.split('/')[-1]
        if '.' not in video_filename:
            raise ExtractorError(u'Unable to determine filename extension: %s' % video_filename)
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2692
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return value with UTF-8 byte strings decoded; anything else unchanged.

        Text strings have no decode() on Python 3, so decoding must be
        limited to genuine byte strings.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none can be opened"""
        for url in url_list:
            try:
                handle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            # Only reachability matters here; release the connection.
            handle.close()
            return url

        return None

    def _print_formats(self, formats):
        """Print format, bitrate and file extension for every known format."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the audio file behind a mixcloud.com URL.

        Returns a list holding a single info dictionary, or None after
        printing the formats when the 'listformats' parameter is set.
        Raises ExtractorError for an invalid URL, a failed API download, an
        unavailable requested format, or when no candidate URL can be
        opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        file_id = uploader + u'-' + self._to_text(mobj.group(2))

        # construct API request
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (the response body is read as bytes)
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Kept separate from api_url so that the API address can never be
        # mistaken for a media URL when no candidate works.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats:
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')

        file_url = self._to_text(file_url)
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
2797
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Three kinds of URL are handled: a single video page, a course page
    (every video page linked from it is extracted) and the site root (every
    course page linked from the home page is extracted).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL kind and return the list of extracted videos."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        course = mobj.group('course')
        video = mobj.group('video')
        if course and video: # A specific video
            return self._extract_video(course, video)
        if course: # A course page
            return self._extract_course(url, course)
        return self._extract_root() # Root page

    def _extract_video(self, course, video):
        """Build the info dictionary of one video from its metadata XML."""
        info = {
            'id': course + '_' + video,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(info['id'])
        baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
        xmlUrl = baseUrl + video + '.xml'
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            # findall() gives an empty list for a missing element, hence
            # the IndexError handling
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')
        info['ext'] = info['url'].rpartition('.')[2]
        return [info]

    def _extract_course(self, url, course):
        """Extract every video page linked from a course page."""
        coursepage = self._download_webpage(url, course,
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

        # Title and description are looked up as before although only the
        # extracted videos are returned
        self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=course)
        self._html_search_regex('<description>([^<]+)</description>',
            coursepage, u'description', fatal=False)

        video_pages = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
        return self._follow_pages(video_pages)

    def _extract_root(self):
        """Extract every course page linked from the home page."""
        self.report_download_webpage('Stanford OpenClassroom')
        rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
        try:
            rootpage = compat_urllib_request.urlopen(rootURL).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

        course_pages = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
        return self._follow_pages(course_pages)

    def _follow_pages(self, pages):
        """Run the extractor on each relative page link and join the results."""
        targets = ['http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(page)
                   for page in pages]
        results = []
        for target in targets:
            results += self.extract(target)
        return results
2893
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the last listed rendition of an MTV video.

        Reads the identifiers embedded in the video page, requests the
        mediaGen metadata XML and returns a one-element list with the info
        dictionary built from the last <rendition> element.

        Raises ExtractorError when the URL is invalid, a required identifier
        is missing from the page, the metadata cannot be downloaded or the
        rendition lacks a required field.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE(review): looked up but not used in the result; possibly the
        # title was meant to combine it with the mtv_an value - confirm.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # 'performer' was referenced without ever being assigned (NameError).
        # NOTE(review): presumably mtv_an is the artist name - confirm.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are concatenated into the metadata URL below: report a
        # missing one as an extraction error rather than a TypeError.
        if not mtvn_uri or not content_id:
            raise ExtractorError(u'Unable to extract the identifiers needed to request the video metadata')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition in the video metadata')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        src = rendition.find('./src')
        if src is None:
            raise ExtractorError('Invalid rendition field.')
        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        except KeyError:
            raise ExtractorError('Invalid rendition field.')
        video_url = src.text

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
2954
2955
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com; a video is returned as a list of segments."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id: millisecond timestamp followed by two random numbers."""
        stamp = int(time.time() * 1000)
        first = random.randint(1000,1998)
        second = random.randint(1000,9999)
        return "%d%d%d" % (stamp, first, second)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet as a list of characters, shuffled according to seed."""
        remaining = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        while remaining:
            # Pseudo random sequence driven by the seed; each step moves one
            # of the remaining characters to the output
            seed = (seed * 211 + 30031) % 65536
            position = int(math.floor(seed / 65536 * len(remaining)))
            shuffled.append(remaining.pop(position))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode fileId, a '*' separated list of indices into the shuffled alphabet."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(token)] for token in fileId.split('*') if token)

    def _real_extract(self, url):
        """Return one info dictionary per segment of the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            stream = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if stream is None or stream == 'best':
                stream = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif stream == 'worst':
                stream = 'mp4'
                ext = u'mp4'
            else:
                # Any other requested format falls back to flv
                stream = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][stream]
            keys = [seg['k'] for seg in entry['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 of the file id hold the segment number (in
        # hexadecimal) and are replaced for every segment
        prefix, suffix = fileid[0:8], fileid[10:]
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (prefix, index, suffix)
            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key),
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3048
3049
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list describing the flv video of the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded percent-encoded in the page
        encoded_url = self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(encoded_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        # The thumbnail is optional
        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
            u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3088
3089
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the highest resolution video attached to a post.

        Downloads the post page for the metadata, follows the link to the
        video page and picks, among the video links found there, the one
        with the largest resolution value.

        Raises ExtractorError when the URL is invalid or no video link can
        be found. Upload date and uploader are optional.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            try:
                upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                # The date is optional: an unexpected format must not abort
                # the whole extraction
                self._downloader.report_warning(u'unable to parse upload date: %s' % upload_date)
                upload_date = None

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes from the video page; every match
        # is a (resolution, url) tuple of strings
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Pick the highest resolution. The resolution has to be compared as
        # a number: compared as text, '1080' would rank below '720'.
        best_link = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = best_link[1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3163
class NBAIE(InfoExtractor):
    """Information extractor for the videos of nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list describing the 1280x720 mp4 of the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Path of the video below /video, with its leading slash
        video_id = mobj.group(1)
        shortened_video_id = video_id.rpartition('/')[2]

        webpage = self._download_webpage(url, video_id)

        # The file URL is derived from the path, it is not read from the page
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        raw_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id)
        title = raw_title.replace('NBA.com: ', '')

        # The upload date is not present in the HTML that is returned to us

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
3197
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        last = offset + self._JUSTIN_PAGE_LIMIT
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, last))

    def _parse_page(self, url, video_id):
        """Download one page of the API and parse its clips.

        Returns a tuple (number of items in the page, list of info
        dictionaries of the items that have a video URL).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) is not list:
            # Anything but a list is an error description
            raise ExtractorError(u'Justin.tv API: %s' % response.get('error', 'unknown error'))

        info = []
        for clip in response:
            clip_url = clip['video_file_url']
            if not clip_url:
                continue
            extension = os.path.splitext(clip_url)[1][1:]
            # The first 10 characters of start_time without the dashes
            date = re.sub('-', '', clip['start_time'][:10])
            uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': clip_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', uploader_id),
                'uploader_id': uploader_id,
                'upload_date': date,
                'ext': extension,
            })
        return (len(response), info)

    def _extract_chapter(self, url, api_base, chapter_id):
        """Return the one-element result list for a chapter URL."""
        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)

        # Look for the archive the page refers to
        archive = None
        for candidate in doc.findall('.//archive'):
            if archive_id == candidate.find('./id').text:
                archive = candidate
                break
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                               note='Downloading chapter metadata',
                               errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        channel = chapter_info['channel']
        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': channel['display_name'],
            'uploader_id': channel['name'],
        }]

    def _real_extract(self, url):
        """Extract a channel archive (paged), a chapter or a single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        channel_id = mobj.group('channelid')
        if not channel_id and mobj.group('chapterid'):
            return self._extract_chapter(url, api_base, mobj.group('chapterid'))

        # Only channel archives are split into pages
        paged = bool(channel_id)
        if paged:
            video_id = channel_id
            api = api_base + '/channel/archives/%s.json' % video_id
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
        offset = 0
        info = []
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A page that is not full is the last one
            if not (paged and page_count == limit):
                break
            offset += limit
        return info
3330
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for the videos of funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list describing the mp4 video of the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The URL is taken from the second <source> of the <video> element
        source_re = r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"'
        video_url = self._html_search_regex(source_re, webpage, u'video URL', flags=re.DOTALL)

        # Page heading first, <title> as fallback
        title_res = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(title_res, webpage, 'title', flags=re.DOTALL)

        description_re = r'<meta property="og:description" content="(?P<desc>.*?)"'
        video_description = self._html_search_regex(description_re, webpage,
            u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
3359
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a store.steampowered.com game."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so it needs re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of the game videos.

        The videos page of the game is requested through the age check URL
        with a fixed birth date. Movie entries, titles and thumbnails are
        matched up in page order.

        Raises ExtractorError when the URL is invalid, the page has no game
        title or a movie entry has no URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)

        # Report a page without header as an extraction error instead of
        # failing with an AttributeError on None
        title_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find the game title')
        game_title = title_match.group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        videos = []
        # zip() stops at the shortest of the three sequences
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3404
class UstreamIE(InfoExtractor):
    """Information extractor for the recorded videos of ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dictionary (not wrapped in a list) of a recording."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title_re = r'data-title="(?P<title>.+)"'
        video_title = self._html_search_regex(title_re, webpage, u'title')

        # Uploader and thumbnail are optional
        uploader_re = r'data-content-type="channel".*?>(?P<uploader>.*?)</a>'
        uploader = self._html_search_regex(uploader_re, webpage, u'uploader',
            fatal=False, flags=re.DOTALL)

        thumbnail_re = r'<link rel="image_src" href="(?P<thumb>.*?)"'
        thumbnail = self._html_search_regex(thumbnail_re, webpage, u'thumbnail',
            fatal=False)

        return {
            'id': video_id,
            # The file URL is derived from the id, it is not read from the page
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
3436
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list describing the video of the page."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # The extension is guessed from the URL
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        # Without thumbnail the title is taken from the 'candytitles'
        # element instead, if the page has one (WSHH candy video)
        if not thumbnail:
            candy = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy is not None:
                video_title = candy.group(1)

        info = {
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }
        return [info]
3476
class RBMARadioIE(InfoExtractor):
    """Information extractor for the shows of rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list built from the JSON embedded in the show page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show description is a JSON object assigned to gon.show
        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
            webpage, u'json data')

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension comes from the path only, the query is ignored
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        # Everything but URL and title is optional
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3510
3511
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for candidate in formats:
            if candidate["format"] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Extract the available formats for url.

        Returns a list of info dictionaries selected according to the
        downloader's 'format' parameter, or None after printing the format
        table when 'listformats' is set.  Raises ExtractorError when the URL
        is invalid, the embedded JSON is broken or incomplete, no download
        link is found, or the requested format does not exist.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed by sending the verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as e:
            # str() is required: concatenating the exception object itself
            # would raise TypeError and hide the real problem.
            raise ExtractorError('Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fifth path component starts with "<size>_<bitrate>_".
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]
            format_id = "-".join(format_parts)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_id,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are kept in page order: first is treated as best, last as worst.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            requested = self._specific(req_format, formats)
            if requested is None:
                raise ExtractorError(u'Requested format not available')
            return [requested]
3616
3617
3618
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the id and the title are taken straight from the URL.
        video_id, video_title = url_match.group('videoid', 'title')

        webpage = self._download_webpage(url, video_id)

        # The media URL appears quoted in the page and has to be unquoted.
        quoted_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(quoted_url)

        # The upload date is optional; normalise it only when it was found.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3653
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id from the URL is only used while fetching the landing page;
        # the numeric id of the embed page replaces it below.
        landing_page = self._download_webpage(url, url_match.group('videoid'))

        video_title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', landing_page, u'title').strip()

        # Locate the embed page, which is where the media URL lives.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)',
            landing_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)
        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3694
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is a JSON literal assigned to PAGE.mix.
        mix = json.loads(self._search_regex(
            r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL))

        # A random number serves as the play session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']

        # Tracks are requested one at a time: "play" yields the first one and
        # every later one comes from "next" with the previous track id, until
        # the API reports the last track.
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        track_number = 1
        while True:
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            track_set = json.loads(api_json)[u'set']
            track = track_set['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if track_set['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
            track_number += 1
        return tracks
3735
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The page is only needed for the title and the uploader; the media
        # and thumbnail URLs are built directly from the video id.
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
3763
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # NOTE: this pattern is written in verbose syntax and must always be
    # compiled or matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list with either a talk info dictionary or a playlist result.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist

        Every talk found on the playlist page becomes a url_result handled by
        the TED extractor.  Raises ExtractorError when the playlist title
        cannot be found.
        '''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        if m_playlist is None:
            raise ExtractorError(u'Unable to extract playlist title')
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # The video matches are only zipped in to bound the number of
        # entries; the talk URL itself comes from the name match.
        for _, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url

        Raises ExtractorError when url does not match _VALID_URL or when the
        title or the talk details cannot be found in the page.  A missing
        thumbnail is not an error; the field is then None.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title_match = re.search(title_RE, webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        if info_match is None:
            raise ExtractorError(u'Unable to extract talk details')
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        # The thumbnail is an optional field, so its absence must not abort
        # the extraction.
        if thumb_match is None:
            thumbnail = None
        else:
            thumbnail = thumb_match.group('thumbnail')
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail
                }
        return info
3842
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError when the metadata XML lacks the download URL
        or the title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; the URL and the title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # The format falls back to the file extension when the metadata
        # carries no explicit format id.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        # description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3896
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # File name and duration come from a per-video XML document.
        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        variants = xml.etree.ElementTree.fromstring(xml_code)

        # Only the last child of the document root is used.
        chosen = variants[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
3928
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional; both http and https are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped from the title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
3965
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (ardmediathek.de, mediathek.daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Picks the highest-quality stream of media type 0.  Raises
        ExtractorError when the id, the title or a usable stream cannot be
        determined, or when the stream data has an unexpected form.
        """
        # determine video id from url: a numeric documentId wins over the
        # last path component
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            m = re.match(self._VALID_URL, url)
            if m is None:
                raise ExtractorError(u'Invalid URL: %s' % url)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')
        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # A page without streams that mentions "fsk" is taken to be
            # time-restricted; anything else is an extraction failure.
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'Unable to extract media streams')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            # Explicit checks instead of assert: asserts vanish under -O.
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream["video_url"])
            info["url"] = stream["video_url"]
        return [info]
4004
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        url_match = re.match(self._VALID_URL, url)
        video_id, blog = url_match.group('id', 'blog_name')

        # Whatever URL form was given, the post page is the one downloaded.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        # The player markup is embedded with its quotes written as \x22.
        source_re = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        source = re.search(source_re, webpage)
        if source is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The page title is not complete, but it is the only place that
        # provides a title for all videos.
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': source.group('video_url'),
            'title': title,
            'thumbnail': thumbnail,
            'ext': source.group('ext'),
        }]
4038
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        url_title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, url_title)

        # Only tracks that link to a free download page are supported.
        free_page_match = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if free_page_match is None:
            raise ExtractorError(u'No free songs found')
        download_link = free_page_match.group(1)

        track_id = re.search(
            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(
            download_link, track_id, 'Downloading free downloads page')

        # The track description is a JSON list embedded in the page script.
        items_json = re.search(
            r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # The URL listed there reports an expired link when used directly,
        # so it is only mined for the parameters of the real request.
        initial_url = track[u'downloads'][u'mp3-320'][u'url']
        url_parts = re.match(
            r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
            initial_url)

        # Build the request the site's download script would issue.
        server, fsig, timestamp = url_parts.group('server', 'fsig', 'ts')
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (server, fsig, track_id, timestamp)
        final_url_webpage = self._download_webpage(
            request_url, track_id, 'Requesting download url')

        # With a correctly generated .rand field the URL would be in the
        # "download_url" key; as it is, the "retry_url" key is used.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
4084
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list holding the info dictionary for url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both values are read from the HTML5 player markup of the page.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
4112
class InaIE(InfoExtractor):
    """Information extractor for ina.fr videos."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list holding the info dictionary for url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # Everything needed is in the MRSS notice of the video, so the
        # original page is never downloaded.
        mrss = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            mrss, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', mrss, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
4139
class HowcastIE(InfoExtractor):
    """Information extractor for howcast.com videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always fetched through its canonical address.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        # Description and thumbnail are optional.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
4173
class VineIE(InfoExtractor):
    """Information extractor for vine.co videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page is always fetched through its canonical https address.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')

        # Thumbnail and uploader are optional.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4207
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos

    The stream address is resolved in three requests: the photo page
    (for the photo secret), a first XML document (for the node id) and a
    playlist XML document that carries the STREAM element.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url."""
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        # Step 1: the photo page, rebuilt from the uploader id and photo id.
        page_url = ''.join(['http://www.flickr.com/photos/', uploader_id, '/', video_id])
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 2: the first XML document yields the node id.
        mtl_url = ''.join([
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ])
        mtl_xml = self._download_webpage(mtl_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            mtl_xml, u'node_id')

        # Step 3: the playlist document for that node holds the stream.
        playlist_url = ''.join([
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ])
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if not stream:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH attribute is HTML-unescaped before being
        # appended to the APP attribute.
        app, full_path = stream.group(1), stream.group(2)
        stream_url = app + unescapeHTML(full_path)

        # The remaining metadata comes from the Open Graph tags of the
        # photo page; either quoting style is accepted for 'content'.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')

        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id':          video_id,
            'url':         stream_url,
            'ext':         'mp4',
            'title':       title,
            'description': description,
            'thumbnail':   thumbnail_url,
            'uploader_id': uploader_id,
        }
        return [info]
4256
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos

    The numeric video id is read from the video page; the media URL then
    comes from the XML document at http://teamcoco.com/cvp/2.0/<id>.xml.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError when url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Only the URL slug is known at this point, so it is what gets
        # passed along with the page download in place of a video id.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The <file type="high"> element of the per-video XML document
        # carries the media URL.
        cvp_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            cvp_xml, u'video URL')

        info = {
            'id':          video_id,
            'url':         media_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumbnail_url,
            'description': description,
        }
        return [info]
4295
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster

    The media URL, title, upload date, uploader id and thumbnail are all
    read from the markup and inline player configuration of the movie page.
    """
    # The dot after "www" is escaped so that only a literal "www." prefix
    # is accepted in front of the host name.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError when url does not match _VALID_URL or when
        the player configuration with the media location cannot be found.
        A missing upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id only; the title part of the path is
        # left empty.
        webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # With an empty 'srv' the percent-decoded 'file' value is used
            # as the URL on its own.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it cannot be seen anywhere in the UI.

        # The date is taken from a hint='YYYY-MM-DD HH:MM:SS TZ' attribute
        # and reduced to YYYYMMDD.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        # The capture stops at the next '<' so that it holds the link text
        # only and not the beginning of the closing tag.
        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
4347
class HypemIE(InfoExtractor):
    """Information Extractor for hypem

    The track page embeds a JSON track list; the id and key of its first
    entry are then posted to the /serve/source/ endpoint, which answers
    with a JSON document containing the final media URL.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for url.

        Raises ExtractorError when url does not match _VALID_URL, when
        either JSON document cannot be parsed, or when the embedded track
        list has no first entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        # Extend an existing query string instead of starting a second one.
        separator = "&" if "?" in url else "?"
        complete_url = url + separator + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is sent back with the metadata
        # request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # Valid JSON may still lack a usable first track; report that as an
        # extraction error rather than letting a raw lookup error escape.
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine did not return any track.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The empty body turns the request into a POST. It has to be a byte
        # string: Python 3 refuses text as request data, and on Python 2
        # b"" is the same object type as "".
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4397
4398
def gen_extractors():
    """Build one fresh instance of every supported extractor.

    The position in the returned list is significant: a URL is handled by
    the first extractor in the list that matches it.
    """
    # Classes are listed in matching priority order.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
4464
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The class is taken from this module's global namespace; a name that is
    not defined there raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]