2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this excerpt preserves the original file's line numbers as a
# prefix on every line; gaps in that numbering show that many lines have been
# elided from this view. Do NOT assume any method body below is complete —
# confirm against the full file before changing behavior.
#
# Base class of every site-specific extractor. Subclasses override
# _real_initialize() / _real_extract() and define _VALID_URL (see docstring).
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# Constructor: stores the (optional) FileDownloader via set_downloader().
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
# Classmethod in the original (decorator line elided here): URL dispatch test.
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
# The `def` lines for the next two methods are elided from this view;
# only their docstrings and (partial) bodies remain.
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
# Template methods — subclasses override; base implementations (elided) are no-ops.
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
# Derives IE_NAME by stripping the trailing "IE" from the class name
# (enclosing property/def line elided from this view).
115 return type(self).__name__[:-2]
# Central download chokepoint: all webpage fetches funnel through here.
# Wraps URLError/HTTPException/socket.error into ExtractorError.
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Fetches a page and decodes it using the charset from the Content-Type
# header when present (fallback path elided from this view).
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
# Debug aid: with --dump-intermediate-pages, emit the page base64-encoded.
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
# 'replace' so undecodable bytes never abort extraction.
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
# Dict literal continues on elided lines (presumably 'url' and 'ie' keys).
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
# Generic field scraper. `pattern` may be a single regex or a list of
# regexes (the list-iteration branch is partially elided below); failure
# either returns `default`, warns, or raises, depending on `fatal`.
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
# Colorize the field name in error output on ANSI-capable terminals only.
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
# Base class for "ytsearchN:query"-style paged search extractors.
# NOTE(review): line-number gaps show elided lines (e.g. the classmethod
# decorators and several if/try lines) — bodies below are not complete.
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# Builds the _VALID_URL regex from the subclass's _SEARCH_KEY; prefix may
# be empty (1 result), a positive integer, or the literal 'all'.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
# Parses the prefix and dispatches to _get_n_results with the right count.
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
# Requests beyond the site limit are clamped to _MAX_RESULTS with a warning.
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
# NOTE(review): "sublclasses" typo in the runtime message below — it is part
# of program output, so fix it in a code change, not in documentation.
272 raise NotImplementedError("This method must be implemented by sublclasses")
# YouTube extractor: login / age-gate handling, subtitle download, and
# format selection from url_encoded_fmt_stream_map.
# NOTE(review): line-number gaps show many elided lines throughout this
# class (e.g. the _VALID_URL assignment line itself, most dict bodies, and
# the if/try lines paired with the elif/except lines below). Treat every
# body as a fragment of the full file.
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
# Verbose-mode URL regex body (its `_VALID_URL = r'''...` opener is elided).
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> file-extension map (most entries elided from this view).
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map (entries elided from this view).
317 _video_dimensions = {
# Defers playlist URLs to YoutubePlaylistIE; note _VALID_URL needs re.VERBOSE.
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
# NOTE(review): docstring below is copy-pasted from the method above and is
# inaccurate for this and the next method — a code change should fix them.
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
# Queries the timedtext list endpoint; on error returns an (error, None)
# tuple rather than raising — callers test with isinstance(..., tuple).
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Map lang_code -> display name from the XML attribute pairs.
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
# Downloads one subtitle track; returns (error, sub_lang, sub) triple
# with error=None on success (urlencode params body elided).
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
# Fallback path: pull the auto-caption ttsurl out of the inlined
# ytplayer.config JSON on the watch page.
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
# Picks one language: --sub-lang if given, else 'en', else the first
# available; returns a one-element list of (error, lang, sub) triples.
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Like _extract_subtitle but fetches every available language (--all-subs).
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Initialization: set language cookie, then log in via username/password
# or .netrc, then confirm age. Each step warns (or raises, for age) on
# network failure. Many control-flow lines are elided below.
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set language: best-effort, failure only warns.
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
# Log in: scrape GALX/dsh tokens from the login page, then POST the form.
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login form fields (several entries elided from this view).
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, login failed.
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age: unlike the steps above, failure here raises.
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Pulls the video ID out of any accepted URL form (group 2 of _VALID_URL).
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
# Main extraction pipeline: watch page -> get_video_info -> metadata ->
# subtitles -> format selection -> per-format result dicts.
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try successive el= variants of get_video_info until one yields a token.
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
# uploader (required)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader id (optional, scraped from the watch page)
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
# title (required)
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional)
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the page and normalized to YYYYMMDD.
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
# description: element on the page, falling back to the meta tag.
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
# subtitles: closed captions first, automatic captions as fallback.
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_warning(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_warning(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
# RTMP streams carry the URL in 'conn'; HTTP streams come from the
# url_encoded_fmt_stream_map, itag -> signed URL.
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
# Format selection honors --max-quality, --prefer-free-formats, and the
# requested format spec (best/worst/all or a slash-delimited list).
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
761 # Specific formats. We pick the first in a slash-delimeted sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
767 video_url_list = [(rf, url_map[rf])]
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format.
775 for format_param, video_real_url in video_url_list:
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
# Metacafe extractor; also recognizes yt- prefixed IDs and delegates them
# to the YouTube extractor via url_result.
# NOTE(review): line-number gaps show elided lines (the try:/if mobj lines
# pairing with the except/raise lines below) — bodies are fragments.
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
808 def report_disclaimer(self):
809 """Report disclaimer retrieval."""
810 self.to_screen(u'Retrieving disclaimer')
# Fetch the disclaimer page, then POST the family-filter form so adult
# content is reachable; both steps raise ExtractorError on network failure.
812 def _real_initialize(self):
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
# Primary path: direct mediaURL (optionally signed with gdaKey);
# fallback path: parse the flashvars blob for mediaData.
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
879 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python-2-only idiom;
# on Python 3 str has no .decode — verify against the project's compat layer.
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
# Dailymotion extractor: scrapes the flashvars blob and picks the highest
# quality URL available from a fixed preference list.
# NOTE(review): line-number gaps show elided lines (e.g. the `if mobj is
# None:` lines pairing with the raise statements below) — bodies are fragments.
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
# ID is the path segment before any '_title' suffix or query string.
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
# Disable the family filter so age-gated videos are reachable.
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Quality preference, best first; the first key present in flashvars wins.
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
927 self.to_screen(u'Using %s' % key)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
934 raise ExtractorError(u'Unable to extract video URL')
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
# Page shows DD-MM-YYYY; reorder to the YYYYMMDD convention.
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
# Photobucket extractor: prefers the JSON blob embedded in the page's
# javascript, falling back to the <link rel="video_src"> tag.
# NOTE(review): line-number gaps show elided lines (e.g. the `if mobj is
# None:` guards pairing with the raise statements) — bodies are fragments.
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
971 # Check if it's necessary to keep the old extracion process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
# Extension comes straight from the URL match (flv or mp4).
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
993 info = json.loads(mobj.group('json'))
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1010 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on str is Python-2-only — verify against
# the project's compat layer before running under Python 3.
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
1024 class YahooIE(InfoExtractor):
# NOTE(review): elided excerpt — error-guard lines between the numbered
# statements are not shown here.
1025 """Information extractor for screen.yahoo.com."""
1026 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1028 def _real_extract(self, url):
1029 mobj = re.match(self._VALID_URL, url)
1031 raise ExtractorError(u'Invalid URL: %s' % url)
1032 video_id = mobj.group('id')
1033 webpage = self._download_webpage(url, video_id)
# Two extraction strategies: if the page exposes a Media CONTENT_ID we use
# the YQL-based JSON API; otherwise we fall back to the mrss REST endpoint.
1034 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
1037 # TODO: Check which url parameters are required
1038 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1039 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose multi-line regex over the mrss XML: title, description, publish
# date and large thumbnail.
1040 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1041 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1042 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1043 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1045 self.report_extraction(video_id)
1046 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1048 raise ExtractorError(u'Unable to extract video info')
1049 video_title = m_info.group('title')
1050 video_description = m_info.group('description')
1051 video_thumb = m_info.group('thumb')
1052 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD form required by upload_date.
1053 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1055 # TODO: Find a way to get mp4 videos
1056 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1057 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1058 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1059 video_url = m_rest.group('url')
1060 video_path = m_rest.group('path')
1062 raise ExtractorError(u'Unable to extract video url')
1064 else: # We have to use a different method if another id is defined
1065 long_id = m_id.group('new_id')
# YQL query (URL-encoded) against yahoo.media.video.streams for the long id.
1066 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1067 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# The endpoint answers with a JSONP callback; strip the wrapper before
# decoding the JSON payload.
1068 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1069 info = json.loads(json_str)
1070 res = info[u'query'][u'results'][u'mediaObj'][0]
# Take the first advertised stream: host becomes the url, path the play_path.
1071 stream = res[u'streams'][0]
1072 video_path = stream[u'path']
1073 video_url = stream[u'host']
1075 video_title = meta[u'title']
1076 video_description = meta[u'description']
1077 video_thumb = meta[u'thumbnail']
1078 video_date = None # I can't find it
1083 'play_path': video_path,
1084 'title':video_title,
1085 'description': video_description,
1086 'thumbnail': video_thumb,
1087 'upload_date': video_date,
1092 class VimeoIE(InfoExtractor):
# NOTE(review): elided excerpt — some guard/return lines between the
# numbered statements are not shown here.
1093 """Information extractor for vimeo.com."""
1095 # _VALID_URL matches Vimeo URLs
1096 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1099 def _verify_video_password(self, url, video_id, webpage):
# Submit the user-supplied --password together with the page's xsrft token
# to the video's /password endpoint.
1100 password = self._downloader.params.get('password', None)
1101 if password is None:
1102 raise ExtractorError(u'This video is protected by a password, use the --password option')
1103 token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
1104 data = compat_urllib_parse.urlencode({'password': password,
1106 # I didn't manage to use the password with https
1107 if url.startswith('https'):
1108 pass_url = url.replace('https','http')
1111 password_request = compat_urllib_request.Request(pass_url+'/password', data)
1112 password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1113 password_request.add_header('Cookie', 'xsrft=%s' % token)
1114 pass_web = self._download_webpage(password_request, video_id,
1115 u'Verifying the password',
1118 def _real_extract(self, url, new_video=True):
1119 # Extract ID from URL
1120 mobj = re.match(self._VALID_URL, url)
1122 raise ExtractorError(u'Invalid URL: %s' % url)
1124 video_id = mobj.group('id')
# Canonicalize: force https, and rewrite direct-link/pro URLs to the plain
# vimeo.com/<id> form.
1125 if not mobj.group('proto'):
1126 url = 'https://' + url
1127 if mobj.group('direct_link') or mobj.group('pro'):
1128 url = 'https://vimeo.com/' + video_id
1130 # Retrieve video webpage to extract further information
1131 request = compat_urllib_request.Request(url, None, std_headers)
1132 webpage = self._download_webpage(request, video_id)
1134 # Now we begin extracting as much information as we can from what we
1135 # retrieved. First we extract the information common to all extractors,
1136 # and latter we extract those that are Vimeo specific.
1137 self.report_extraction(video_id)
1139 # Extract the config JSON
# The player config JSON is sliced out of the page between the
# ' = {config:' and ',assets:' markers before json.loads.
1141 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1142 config = json.loads(config)
# Distinguish failure causes: embed restriction vs. password protection;
# after a successful password check we simply re-run the extraction.
1144 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1145 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1147 if re.search('If so please provide the correct password.', webpage):
1148 self._verify_video_password(url, video_id, webpage)
1149 return self._real_extract(url)
1151 raise ExtractorError(u'Unable to extract info section')
1154 video_title = config["video"]["title"]
1156 # Extract uploader and uploader_id
1157 video_uploader = config["video"]["owner"]["name"]
1158 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1160 # Extract video thumbnail
1161 video_thumbnail = config["video"]["thumbnail"]
1163 # Extract video description
1164 video_description = get_element_by_attribute("itemprop", "description", webpage)
1165 if video_description: video_description = clean_html(video_description)
1166 else: video_description = u''
1168 # Extract upload date
1169 video_upload_date = None
1170 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1171 if mobj is not None:
1172 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1174 # Vimeo specific: extract request signature and timestamp
1175 sig = config['request']['signature']
1176 timestamp = config['request']['timestamp']
1178 # Vimeo specific: extract video codec and quality information
1179 # First consider quality, then codecs, then take everything
1180 # TODO bind to format param
1181 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Bucket the available files by quality tier; codec preference order is the
# order of `codecs` above.
1182 files = { 'hd': [], 'sd': [], 'other': []}
1183 for codec_name, codec_extension in codecs:
1184 if codec_name in config["video"]["files"]:
1185 if 'hd' in config["video"]["files"][codec_name]:
1186 files['hd'].append((codec_name, codec_extension, 'hd'))
1187 elif 'sd' in config["video"]["files"][codec_name]:
1188 files['sd'].append((codec_name, codec_extension, 'sd'))
1190 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first file of the best non-empty tier (hd > sd > other).
1192 for quality in ('hd', 'sd', 'other'):
1193 if len(files[quality]) > 0:
1194 video_quality = files[quality][0][2]
1195 video_codec = files[quality][0][0]
1196 video_extension = files[quality][0][1]
1197 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1200 raise ExtractorError(u'No known codec found')
# Build the signed play_redirect URL from the request signature/timestamp.
1202 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1203 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1208 'uploader': video_uploader,
1209 'uploader_id': video_uploader_id,
1210 'upload_date': video_upload_date,
1211 'title': video_title,
1212 'ext': video_extension,
1213 'thumbnail': video_thumbnail,
1214 'description': video_description,
1218 class ArteTvIE(InfoExtractor):
# NOTE(review): elided excerpt — some surrounding lines (try:, returns,
# regex-tuple arguments) are not shown between the numbered statements.
1219 """arte.tv information extractor."""
1221 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1222 _LIVE_URL = r'index-[0-9]+\.html$'
1224 IE_NAME = u'arte.tv'
1226 def fetch_webpage(self, url):
# Raw page download helper; wraps network and URL errors in ExtractorError.
1227 request = compat_urllib_request.Request(url)
1229 self.report_download_webpage(url)
1230 webpage = compat_urllib_request.urlopen(request).read()
1231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1232 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1233 except ValueError as err:
1234 raise ExtractorError(u'Invalid URL: %s' % url)
1237 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Download `url`, apply `regex`, and copy each requested group into an info
# dict; matchTuples is a list of (group_index, key, error_message).
1238 page = self.fetch_webpage(url)
1239 mobj = re.search(regex, page, regexFlags)
1243 raise ExtractorError(u'Invalid URL: %s' % url)
1245 for (i, key, err) in matchTuples:
1246 if mobj.group(i) is None:
1247 raise ExtractorError(err)
1249 info[key] = mobj.group(i)
1253 def extractLiveStream(self, url):
# Live streams: locate the videothek JS file, then grep the geo-specific
# stream path, SWF player and rtmp url out of it.
1254 video_lang = url.split('/')[-4]
1255 info = self.grep_webpage(
1257 r'src="(.*?/videothek_js.*?\.js)',
1260 (1, 'url', u'Invalid URL: %s' % url)
1263 http_host = url.split('/')[2]
1264 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1265 info = self.grep_webpage(
1267 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1268 '(http://.*?\.swf).*?' +
1272 (1, 'path', u'could not extract video path: %s' % url),
1273 (2, 'player', u'could not extract video player: %s' % url),
1274 (3, 'url', u'could not extract video url: %s' % url)
1277 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1279 def extractPlus7Stream(self, url):
# Plus7 (catch-up) videos: follow two levels of referenced XML documents,
# picking the variant matching the page language, then read id/title/date
# and the hd-quality url from the final <video> element.
1280 video_lang = url.split('/')[-3]
1281 info = self.grep_webpage(
1283 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1286 (1, 'url', u'Invalid URL: %s' % url)
1289 next_url = compat_urllib_parse.unquote(info.get('url'))
1290 info = self.grep_webpage(
1292 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1295 (1, 'url', u'Could not find <video> tag: %s' % url)
1298 next_url = compat_urllib_parse.unquote(info.get('url'))
1300 info = self.grep_webpage(
1302 r'<video id="(.*?)".*?>.*?' +
1303 '<name>(.*?)</name>.*?' +
1304 '<dateVideo>(.*?)</dateVideo>.*?' +
1305 '<url quality="hd">(.*?)</url>',
1308 (1, 'id', u'could not extract video id: %s' % url),
1309 (2, 'title', u'could not extract video title: %s' % url),
1310 (3, 'date', u'could not extract video date: %s' % url),
1311 (4, 'url', u'could not extract video url: %s' % url)
1316 'id': info.get('id'),
1317 'url': compat_urllib_parse.unquote(info.get('url')),
1318 'uploader': u'arte.tv',
1319 'upload_date': unified_strdate(info.get('date')),
1320 'title': info.get('title').decode('utf-8'),
1326 def _real_extract(self, url):
# Dispatch on URL shape: live-index pages go to extractLiveStream,
# everything else to extractPlus7Stream.
1327 video_id = url.split('/')[-1]
1328 self.report_extraction(video_id)
1330 if re.search(self._LIVE_URL, video_id) is not None:
1331 self.extractLiveStream(url)
1334 info = self.extractPlus7Stream(url)
1339 class GenericIE(InfoExtractor):
# NOTE(review): elided excerpt — several guard/return lines between the
# numbered statements are not shown here.
1340 """Generic last-resort information extractor."""
1343 IE_NAME = u'generic'
1345 def report_download_webpage(self, video_id):
"""Report webpage download."""
# Warn (outside of test mode) that we fell through to the generic extractor.
1347 if not self._downloader.params.get('test', False):
1348 self._downloader.report_warning(u'Falling back on generic information extractor.')
1349 super(GenericIE, self).report_download_webpage(video_id)
1351 def report_following_redirect(self, new_url):
1352 """Report information extraction."""
1353 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1355 def _test_redirect(self, url):
1356 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Resolve redirects with HEAD requests so we never download the body of a
# shortener/intermediate page.
1357 class HeadRequest(compat_urllib_request.Request):
1358 def get_method(self):
1361 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1363 Subclass the HTTPRedirectHandler to make it use our
1364 HeadRequest also on the redirected URL
1366 def redirect_request(self, req, fp, code, msg, headers, newurl):
1367 if code in (301, 302, 303, 307):
1368 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: the redirected request carries no body.
1369 newheaders = dict((k,v) for k,v in req.headers.items()
1370 if k.lower() not in ("content-length", "content-type"))
1371 return HeadRequest(newurl,
1373 origin_req_host=req.get_origin_req_host(),
1376 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1378 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1380 Fallback to GET if HEAD is not allowed (405 HTTP error)
1382 def http_error_405(self, req, fp, code, msg, headers):
1386 newheaders = dict((k,v) for k,v in req.headers.items()
1387 if k.lower() not in ("content-length", "content-type"))
1388 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1390 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener with the custom handlers above; order of
# add_handler calls mirrors this handler list.
1394 opener = compat_urllib_request.OpenerDirector()
1395 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1396 HTTPMethodFallback, HEADRedirectHandler,
1397 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1398 opener.add_handler(handler())
1400 response = opener.open(HeadRequest(url))
1401 if response is None:
1402 raise ExtractorError(u'Invalid URL protocol')
1403 new_url = response.geturl()
1408 self.report_following_redirect(new_url)
1411 def _real_extract(self, url):
# If the URL redirects somewhere else, delegate to the redirect target.
1412 new_url = self._test_redirect(url)
1413 if new_url: return [self.url_result(new_url)]
1415 video_id = url.split('/')[-1]
1417 webpage = self._download_webpage(url, video_id)
1418 except ValueError as err:
1419 # since this is the last-resort InfoExtractor, if
1420 # this error is thrown, it'll be thrown here
1421 raise ExtractorError(u'Invalid URL: %s' % url)
1423 self.report_extraction(video_id)
# Cascade of heuristics, most specific first; each fallback only runs when
# the previous regex found nothing.
1424 # Start with something easy: JW Player in SWFObject
1425 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1427 # Broaden the search a little bit
1428 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1430 # Broaden the search a little bit: JWPlayer JS loader
1431 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1433 # Try to find twitter cards info
1434 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1436 # We look for Open Graph info:
1437 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1438 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1439 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1440 if m_video_type is not None:
1441 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1443 raise ExtractorError(u'Invalid URL: %s' % url)
1445 # It's possible that one of the regexes
1446 # matched, but returned an empty group:
1447 if mobj.group(1) is None:
1448 raise ExtractorError(u'Invalid URL: %s' % url)
1450 video_url = compat_urllib_parse.unquote(mobj.group(1))
1451 video_id = os.path.basename(video_url)
1453 # here's a fun little line of code for you:
# Derive extension and id from the URL's basename.
1454 video_extension = os.path.splitext(video_id)[1][1:]
1455 video_id = os.path.splitext(video_id)[0]
1457 # it's tempting to parse this further, but you would
1458 # have to take into account all the variations like
1459 # Video Title - Site Name
1460 # Site Name | Video Title
1461 # Video Title - Tagline | Site Name
1462 # and so on and so forth; it's just not practical
1463 video_title = self._html_search_regex(r'<title>(.*)</title>',
1464 webpage, u'video title')
1466 # video uploader is domain name
1467 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1468 url, u'video uploader')
1473 'uploader': video_uploader,
1474 'upload_date': None,
1475 'title': video_title,
1476 'ext': video_extension,
1480 class YoutubeSearchIE(SearchInfoExtractor):
# NOTE(review): elided excerpt — loop/initialization lines between the
# numbered statements are not shown here.
1481 """Information Extractor for YouTube search queries."""
# gdata API, 50 results per page, JSON-C output.
1482 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1484 IE_NAME = u'youtube:search'
1485 _SEARCH_KEY = 'ytsearch'
1487 def report_download_page(self, query, pagenum):
1488 """Report attempt to download search page with given number."""
1489 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1491 def _get_n_results(self, query, n):
1492 """Get a specified number of results for a query"""
# Page through the API 50 ids at a time until we have `limit` results.
1498 while (50 * pagenum) < limit:
1499 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the gdata API.
1500 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1501 request = compat_urllib_request.Request(result_url)
1503 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1504 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1505 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1506 api_response = json.loads(data)['data']
1508 if not 'items' in api_response:
1509 raise ExtractorError(u'[youtube] No video results')
1511 new_ids = list(video['id'] for video in api_response['items'])
1512 video_ids += new_ids
# Never request more than the API reports to exist.
1514 limit = min(n, api_response['totalItems'])
1517 if len(video_ids) > n:
1518 video_ids = video_ids[:n]
1519 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1520 return self.playlist_result(videos, query)
1523 class GoogleSearchIE(SearchInfoExtractor):
# NOTE(review): elided excerpt — playlist-dict and loop-exit lines between
# the numbered statements are not shown here.
1524 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination button marks that more pages exist.
1525 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1527 IE_NAME = u'video.google:search'
1528 _SEARCH_KEY = 'gvsearch'
1530 def _get_n_results(self, query, n):
1531 """Get a specified number of results for a query"""
1534 '_type': 'playlist',
# Scrape the HTML result pages (10 hits per page) until n results are
# collected or no further page is advertised.
1539 for pagenum in itertools.count(1):
1540 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1541 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1542 note='Downloading result page ' + str(pagenum))
1544 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1547 'url': mobj.group(1)
1549 res['entries'].append(e)
1551 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
1554 class YahooSearchIE(SearchInfoExtractor):
# NOTE(review): elided excerpt — playlist-dict, break and `m` assignment
# lines between the numbered statements are not shown here.
1555 """Information Extractor for Yahoo! Video search queries."""
1558 IE_NAME = u'screen.yahoo:search'
1559 _SEARCH_KEY = 'yvsearch'
1561 def _get_n_results(self, query, n):
1562 """Get a specified number of results for a query"""
1565 '_type': 'playlist',
# JSON search endpoint, 30 results per page (b= is the result offset).
1569 for pagenum in itertools.count(0):
1570 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1571 webpage = self._download_webpage(result_url, query,
1572 note='Downloading results page '+str(pagenum+1))
1573 info = json.loads(webpage)
1575 results = info[u'results']
1577 for (i, r) in enumerate(results):
# Stop once we have delivered n entries across all pages.
1578 if (pagenum * 30) +i >= n:
1580 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1581 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1582 res['entries'].append(e)
# Also stop when the API reports this page contains the last result.
1583 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
1589 class YoutubePlaylistIE(InfoExtractor):
# NOTE(review): elided excerpt — parts of the verbose _VALID_URL pattern and
# several loop/guard lines are not shown between the numbered statements.
1590 """Information Extractor for YouTube playlists."""
1592 _VALID_URL = r"""(?:
1597 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1598 \? (?:.*?&)*? (?:p|a|list)=
1601 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1604 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
# gdata playlist feed, paged via max-results/start-index.
1606 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1608 IE_NAME = u'youtube:playlist'
1611 def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is written in verbose mode, so re.VERBOSE is required here.
1613 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1615 def _real_extract(self, url):
1616 # Extract playlist id
1617 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1619 raise ExtractorError(u'Invalid URL: %s' % url)
1621 # Download playlist videos from API
# The id may come from either alternative of the pattern.
1622 playlist_id = mobj.group(1) or mobj.group(2)
1627 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1628 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1631 response = json.loads(page)
1632 except ValueError as err:
1633 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1635 if 'feed' not in response:
1636 raise ExtractorError(u'Got a malformed response from YouTube API')
1637 playlist_title = response['feed']['title']['$t']
1638 if 'entry' not in response['feed']:
1639 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs so the playlist order can be restored
# after paging.
1642 for entry in response['feed']['entry']:
1643 index = entry['yt$position']['$t']
1644 if 'media$group' in entry and 'media$player' in entry['media$group']:
1645 videos.append((index, entry['media$group']['media$player']['url']))
# A short page means we reached the final page of the feed.
1647 if len(response['feed']['entry']) < self._MAX_RESULTS:
1651 videos = [v[1] for v in sorted(videos)]
1653 url_results = [self.url_result(url, 'Youtube') for url in videos]
1654 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1657 class YoutubeChannelIE(InfoExtractor):
# NOTE(review): elided excerpt — initialization and loop-control lines
# between the numbered statements are not shown here.
1658 """Information Extractor for YouTube channels."""
1660 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1661 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# This marker in a page means a further page of videos can be loaded.
1662 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1663 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1664 IE_NAME = u'youtube:channel'
1666 def extract_videos_from_page(self, page):
# Scrape unique watch?v= ids from the HTML, preserving first-seen order.
1668 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1669 if mobj.group(1) not in ids_in_page:
1670 ids_in_page.append(mobj.group(1))
1673 def _real_extract(self, url):
1674 # Extract channel id
1675 mobj = re.match(self._VALID_URL, url)
1677 raise ExtractorError(u'Invalid URL: %s' % url)
1679 # Download channel page
1680 channel_id = mobj.group(1)
1684 url = self._TEMPLATE_URL % (channel_id, pagenum)
1685 page = self._download_webpage(url, channel_id,
1686 u'Downloading page #%s' % pagenum)
1688 # Extract video identifiers
1689 ids_in_page = self.extract_videos_from_page(page)
1690 video_ids.extend(ids_in_page)
1692 # Download any subsequent channel pages using the json-based channel_ajax query
1693 if self._MORE_PAGES_INDICATOR in page:
1695 pagenum = pagenum + 1
1697 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1698 page = self._download_webpage(url, channel_id,
1699 u'Downloading page #%s' % pagenum)
# The ajax endpoint returns JSON; the video list lives in content_html and
# the pagination marker in load_more_widget_html.
1701 page = json.loads(page)
1703 ids_in_page = self.extract_videos_from_page(page['content_html'])
1704 video_ids.extend(ids_in_page)
1706 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1709 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1711 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1712 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1713 return [self.playlist_result(url_entries, channel_id)]
1716 class YoutubeUserIE(InfoExtractor):
# NOTE(review): elided excerpt — guard, initialization and loop lines
# between the numbered statements are not shown here.
1717 """Information Extractor for YouTube users."""
1719 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1720 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The gdata uploads feed caps each query at 50 results.
1721 _GDATA_PAGE_SIZE = 50
1722 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1723 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1724 IE_NAME = u'youtube:user'
1726 def _real_extract(self, url):
1728 mobj = re.match(self._VALID_URL, url)
1730 raise ExtractorError(u'Invalid URL: %s' % url)
1732 username = mobj.group(1)
1734 # Download video ids using YouTube Data API. Result size per
1735 # query is limited (currently to 50 videos) so we need to query
1736 # page by page until there are no video ids - it means we got
# start-index is 1-based in the gdata API.
1743 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1745 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1746 page = self._download_webpage(gdata_url, username,
1747 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1749 # Extract video identifiers
# Deduplicate ids while keeping first-seen order within the page.
1752 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1753 if mobj.group(1) not in ids_in_page:
1754 ids_in_page.append(mobj.group(1))
1756 video_ids.extend(ids_in_page)
1758 # A little optimization - if current page is not
1759 # "full", ie. does not contain PAGE_SIZE video ids then
1760 # we can assume that this page is the last one - there
1761 # are no more ids on further pages - no need to query
1764 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1769 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1770 url_results = [self.url_result(url, 'Youtube') for url in urls]
1771 return [self.playlist_result(url_results, playlist_title = username)]
1774 class BlipTVUserIE(InfoExtractor):
# NOTE(review): elided excerpt — guard, initialization and loop lines
# between the numbered statements are not shown here.
1775 """Information Extractor for blip.tv users."""
1777 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1779 IE_NAME = u'blip.tv:user'
1781 def _real_extract(self, url):
1783 mobj = re.match(self._VALID_URL, url)
1785 raise ExtractorError(u'Invalid URL: %s' % url)
1787 username = mobj.group(1)
1789 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# The numeric users_id needed by the ajax endpoint is scraped from the
# user's mobile page.
1791 page = self._download_webpage(url, username, u'Downloading user page')
1792 mobj = re.search(r'data-users-id="([^"]+)"', page)
1793 page_base = page_base % mobj.group(1)
1796 # Download video ids using BlipTV Ajax calls. Result size per
1797 # query is limited (currently to 12 videos) so we need to query
1798 # page by page until there are no video ids - it means we got
1805 url = page_base + "&page=" + str(pagenum)
1806 page = self._download_webpage(url, username,
1807 u'Downloading video ids from page %d' % pagenum)
1809 # Extract video identifiers
# Deduplicate hrefs while keeping first-seen order within the page.
1812 for mobj in re.finditer(r'href="/([^"]+)"', page):
1813 if mobj.group(1) not in ids_in_page:
1814 ids_in_page.append(unescapeHTML(mobj.group(1)))
1816 video_ids.extend(ids_in_page)
1818 # A little optimization - if current page is not
1819 # "full", ie. does not contain PAGE_SIZE video ids then
1820 # we can assume that this page is the last one - there
1821 # are no more ids on further pages - no need to query
1824 if len(ids_in_page) < self._PAGE_SIZE:
1829 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1830 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1831 return [self.playlist_result(url_entries, playlist_title = username)]
1834 class DepositFilesIE(InfoExtractor):
# NOTE(review): elided excerpt — try:/return lines between the numbered
# statements are not shown here.
1835 """Information extractor for depositfiles.com"""
1837 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1839 def _real_extract(self, url):
1840 file_id = url.split('/')[-1]
1841 # Rebuild url in english locale
1842 url = 'http://depositfiles.com/en/files/' + file_id
1844 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1845 free_download_indication = { 'gateway_result' : '1' }
1846 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1848 self.report_download_webpage(file_id)
1849 webpage = compat_urllib_request.urlopen(request).read()
1850 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1851 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1853 # Search for the real file URL
1854 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1855 if (mobj is None) or (mobj.group(1) is None):
1856 # Try to figure out reason of the error.
# Surface the site's own restriction notice (whitespace-normalized) as the
# error message when no download form is present.
1857 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1858 if (mobj is not None) and (mobj.group(1) is not None):
1859 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1860 raise ExtractorError(u'%s' % restriction_message)
1862 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1864 file_url = mobj.group(1)
1865 file_extension = os.path.splitext(file_url)[1][1:]
1867 # Search for file title
1868 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') on str is Python-2-only; confirm intended
# interpreter.
1871 'id': file_id.decode('utf-8'),
1872 'url': file_url.decode('utf-8'),
1874 'upload_date': None,
1875 'title': file_title,
1876 'ext': file_extension.decode('utf-8'),
1880 class FacebookIE(InfoExtractor):
# NOTE(review): elided excerpt — several guard, try:, login-form and return
# lines between the numbered statements are not shown here.
1881 """Information Extractor for Facebook"""
1883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1884 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1885 _NETRC_MACHINE = 'facebook'
1886 IE_NAME = u'facebook'
1888 def report_login(self):
1889 """Report attempt to log in."""
1890 self.to_screen(u'Logging in')
1892 def _real_initialize(self):
# Optional login step: credentials come from --username/--password or, with
# --netrc, from the 'facebook' machine entry in ~/.netrc. Login failures
# only produce warnings; extraction proceeds unauthenticated.
1893 if self._downloader is None:
1898 downloader_params = self._downloader.params
1900 # Attempt to use provided username and password or .netrc data
1901 if downloader_params.get('username', None) is not None:
1902 useremail = downloader_params['username']
1903 password = downloader_params['password']
1904 elif downloader_params.get('usenetrc', False):
1906 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1907 if info is not None:
1911 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1912 except (IOError, netrc.NetrcParseError) as err:
1913 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1916 if useremail is None:
1925 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1928 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> in the response means authentication did not succeed.
1929 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1930 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1932 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1933 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1936 def _real_extract(self, url):
1937 mobj = re.match(self._VALID_URL, url)
1939 raise ExtractorError(u'Invalid URL: %s' % url)
1940 video_id = mobj.group('ID')
1942 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1943 webpage = self._download_webpage(url, video_id)
# The player parameters are a JSON blob sandwiched between these two exact
# script fragments; re.escape keeps the markers literal.
1945 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1946 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1947 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1949 raise ExtractorError(u'Cannot parse data')
1950 data = dict(json.loads(m.group(1)))
1951 params_raw = compat_urllib_parse.unquote(data['params'])
1952 params = json.loads(params_raw)
1953 video_data = params['video_data'][0]
# Prefer the HD source and fall back to SD.
1954 video_url = video_data.get('hd_src')
1956 video_url = video_data['sd_src']
1958 raise ExtractorError(u'Cannot find video URL')
1959 video_duration = int(video_data['video_duration'])
1960 thumbnail = video_data['thumbnail_src']
1962 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1967 'title': video_title,
1970 'duration': video_duration,
1971 'thumbnail': thumbnail,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
1976 class BlipTVIE(InfoExtractor):
1977 """Information extractor for blip.tv"""
1979 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the filename extension off the final media URL.
1980 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1981 IE_NAME = u'blip.tv'
1983 def report_direct_download(self, title):
1984 """Report information extraction."""
1985 self.to_screen(u'%s: Direct download detected' % title)
1987 def _real_extract(self, url):
1988 mobj = re.match(self._VALID_URL, url)
1990 raise ExtractorError(u'Invalid URL: %s' % url)
# api.swf fragment URLs are rewritten to the /play/ form first.
1992 # See https://github.com/rg3/youtube-dl/issues/857
1993 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1994 if api_mobj is not None:
1995 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1996 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id lives in the redirect's fragment,
# so resolve it and recurse on the canonical /a/a-<id> URL.
1997 if urlp.path.startswith('/play/'):
1998 request = compat_urllib_request.Request(url)
1999 response = compat_urllib_request.urlopen(request)
2000 redirecturl = response.geturl()
2001 rurlp = compat_urllib_parse_urlparse(redirecturl)
2002 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2003 url = 'http://blip.tv/a/a-' + file_id
2004 return self._real_extract(url)
# Ask blip.tv for JSON metadata by spoofing the iTunes user agent.
2011 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2012 request = compat_urllib_request.Request(json_url)
2013 request.add_header('User-Agent', 'iTunes/10.6.1')
2014 self.report_extraction(mobj.group(1))
2017 urlh = compat_urllib_request.urlopen(request)
# A video/* content type means the server handed us the media itself.
2018 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2019 basename = url.split('/')[-1]
2020 title,ext = os.path.splitext(basename)
2021 title = title.decode('UTF-8')
2022 ext = ext.replace('.', '')
2023 self.report_direct_download(title)
2028 'upload_date': None,
2033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2034 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2035 if info is None: # Regular URL
2037 json_code_bytes = urlh.read()
2038 json_code = json_code_bytes.decode('utf-8')
2039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2040 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2043 json_data = json.loads(json_code)
# Metadata may be nested under a 'Post' key or be the top-level object.
2044 if 'Post' in json_data:
2045 data = json_data['Post']
# blip.tv datestamps look like '08-31-12 11:00AM'; normalize to YYYYMMDD.
2049 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2050 video_url = data['media']['url']
2051 umobj = re.match(self._URL_EXT, video_url)
2053 raise ValueError('Can not determine filename extension')
2054 ext = umobj.group(1)
2057 'id': data['item_id'],
2059 'uploader': data['display_name'],
2060 'upload_date': upload_date,
2061 'title': data['title'],
2063 'format': data['media']['mimeType'],
2064 'thumbnail': data['thumbnailUrl'],
2065 'description': data['description'],
2066 'player_url': data['embedUrl'],
2067 'user_agent': 'iTunes/10.6.1',
2069 except (ValueError,KeyError) as err:
2070 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2075 class MyVideoIE(InfoExtractor):
2076 """Information Extractor for myvideo.de."""
2078 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2079 IE_NAME = u'myvideo'
2081 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2082 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2083 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher (key-scheduling loop visible below; PRGA partially elided).
# Used to decrypt the player XML data returned by myvideo.de.
2084 def __rc4crypt(self,data, key):
2086 box = list(range(256))
2087 for i in list(range(256)):
2088 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2089 box[i], box[x] = box[x], box[i]
2095 y = (y + box[x]) % 256
2096 box[x], box[y] = box[y], box[x]
2097 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Returns the hex MD5 digest of s as bytes (helper, def line elided above).
2101 return hashlib.md5(s).hexdigest().encode()
2103 def _real_extract(self,url):
2104 mobj = re.match(self._VALID_URL, url)
2106 raise ExtractorError(u'invalid URL: %s' % url)
2108 video_id = mobj.group(1)
# Obfuscated (double-base64) key material used to derive the RC4 key below.
2111 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2112 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2113 b'TnpsbA0KTVRkbU1tSTRNdz09'
2117 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2118 webpage = self._download_webpage(webpage_url, video_id)
# Easy case: a plain <source src='...'> tag yields an FLV URL directly.
2120 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2121 if mobj is not None:
2122 self.report_extraction(video_id)
2123 video_url = mobj.group(1) + '.flv'
2125 video_title = self._html_search_regex('<title>([^<]+)</title>',
2128 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2134 'upload_date': None,
2135 'title': video_title,
# Hard case: parse the JS flashvars blob into request params for the
# encrypted player XML endpoint.
2140 mobj = re.search('var flashvars={(.+?)}', webpage)
2142 raise ExtractorError(u'Unable to extract video')
2147 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2148 if not a == '_encxml':
2151 encxml = compat_urllib_parse.unquote(b)
2152 if not params.get('domain'):
2153 params['domain'] = 'www.myvideo.de'
2154 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is avoided by falling back to the generic endpoint.
2155 if 'flash_playertype=MTV' in xmldata_url:
2156 self._downloader.report_warning(u'avoiding MTV player')
2158 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2159 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is hex-encoded RC4 ciphertext after the '=' separator.
2163 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2164 enc_data_b = binascii.unhexlify(enc_data)
2166 base64.b64decode(base64.b64decode(GK)) +
2168 str(video_id).encode('utf-8')
2171 dec_data = self.__rc4crypt(enc_data_b, sk)
2174 self.report_extraction(video_id)
# RTMP branch: connectionurl found in the decrypted data.
2177 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2179 video_url = compat_urllib_parse.unquote(mobj.group(1))
2180 if 'myvideo2flash' in video_url:
2181 self._downloader.report_warning(u'forcing RTMPT ...')
2182 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2185 # extract non rtmp videos
2186 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2188 raise ExtractorError(u'unable to extract url')
2189 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2191 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2192 video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests are mapped to their m3u8 HLS equivalent; otherwise build
# the RTMP play path as '<ext>:<path>'.
2194 if not video_file.endswith('f4m'):
2195 ppath, prefix = video_file.split('.')
2196 video_playpath = '%s:%s' % (prefix, ppath)
2197 video_hls_playlist = ''
2200 video_hls_playlist = (
2201 video_filepath + video_file
2202 ).replace('.f4m', '.m3u8')
2204 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2205 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2207 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2213 'tc_url': video_url,
2215 'upload_date': None,
2216 'title': video_title,
2218 'play_path': video_playpath,
2219 'video_file': video_file,
2220 'video_hls_playlist': video_hls_playlist,
2221 'player_url': video_swfobj,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2225 class ComedyCentralIE(InfoExtractor):
2226 """Information extractor for The Daily Show and Colbert Report """
2228 # urls can be abbreviations like :thedailyshow or :colbert
2229 # urls for episodes like:
2230 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2231 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2232 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: must be matched with re.VERBOSE (see suitable()).
2233 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2234 |(https?://)?(www\.)?
2235 (?P<showname>thedailyshow|colbertnation)\.com/
2236 (full-episodes/(?P<episode>.*)|
2238 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2239 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates in ascending quality order; tables below map them to
# extensions/dimensions (bodies elided in this listing).
2242 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2244 _video_extensions = {
2252 _video_dimensions = {
# Overrides the base class because _VALID_URL requires the VERBOSE flag.
2262 def suitable(cls, url):
2263 """Receives a URL and returns True if suitable for this IE."""
2264 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2266 def _print_formats(self, formats):
2267 print('Available formats:')
2269 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2272 def _real_extract(self, url):
2273 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2275 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames like ':tds' expand to the show's full-episodes page.
2277 if mobj.group('shortname'):
2278 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2279 url = u'http://www.thedailyshow.com/full-episodes/'
2281 url = u'http://www.colbertnation.com/full-episodes/'
2282 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2283 assert mobj is not None
2285 if mobj.group('clip'):
2286 if mobj.group('showname') == 'thedailyshow':
2287 epTitle = mobj.group('tdstitle')
2289 epTitle = mobj.group('cntitle')
2292 dlNewest = not mobj.group('episode')
2294 epTitle = mobj.group('showname')
2296 epTitle = mobj.group('episode')
2298 self.report_extraction(epTitle)
# The page may redirect to the latest episode; re-validate the final URL.
2299 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2301 url = htmlHandle.geturl()
2302 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2304 raise ExtractorError(u'Invalid redirected URL: ' + url)
2305 if mobj.group('episode') == '':
2306 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2307 epTitle = mobj.group('episode')
# Locate the mtvnservices media URI embedded in the player markup.
2309 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2311 if len(mMovieParams) == 0:
2312 # The Colbert Report embeds the information in a without
2313 # a URL prefix; so extract the alternate reference
2314 # and then add the URL prefix manually.
2316 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2317 if len(altMovieParams) == 0:
2318 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2320 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Episodes are split into parts; the MRSS index lists one <item> per part.
2322 uri = mMovieParams[0][1]
2323 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2324 indexXml = self._download_webpage(indexUrl, epTitle,
2325 u'Downloading show index',
2326 u'unable to download episode index')
2330 idoc = xml.etree.ElementTree.fromstring(indexXml)
2331 itemEls = idoc.findall('.//item')
2332 for partNum,itemEl in enumerate(itemEls):
2333 mediaId = itemEl.findall('./guid')[0].text
2334 shortMediaId = mediaId.split(':')[-1]
2335 showId = mediaId.split(':')[-2].replace('.com', '')
2336 officialTitle = itemEl.findall('./title')[0].text
2337 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Each part has its own mediaGen config XML listing renditions per bitrate.
2339 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2340 compat_urllib_parse.urlencode({'uri': mediaId}))
2341 configXml = self._download_webpage(configUrl, epTitle,
2342 u'Downloading configuration for %s' % shortMediaId)
2344 cdoc = xml.etree.ElementTree.fromstring(configXml)
2346 for rendition in cdoc.findall('.//rendition'):
2347 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2351 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2354 if self._downloader.params.get('listformats', None):
2355 self._print_formats([i[0] for i in turls])
2358 # For now, just pick the highest bitrate
2359 format,rtmp_video_url = turls[-1]
2361 # Get the format arg from the arg stream
2362 req_format = self._downloader.params.get('format', None)
2364 # Select format if we can find one
2367 format, rtmp_video_url = f, v
# The RTMP URL is rewritten into a plain HTTP URL on the mtvnmobile CDN.
2370 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2372 raise ExtractorError(u'Cannot transform RTMP url')
2373 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2374 video_url = base + m.group('finalid')
2376 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2381 'upload_date': officialDate,
2386 'description': officialTitle,
# One info dict per episode part; the accumulated list is returned.
2388 results.append(info)
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2393 class EscapistIE(InfoExtractor):
2394 """Information extractor for The Escapist """
2396 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2397 IE_NAME = u'escapist'
2399 def _real_extract(self, url):
2400 mobj = re.match(self._VALID_URL, url)
2402 raise ExtractorError(u'Invalid URL: %s' % url)
2403 showName = mobj.group('showname')
2404 videoId = mobj.group('episode')
2406 self.report_extraction(videoId)
2407 webpage = self._download_webpage(url, videoId)
# Description and thumbnail are optional (fatal=False); player URL and
# title are required for extraction to proceed.
2409 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2410 webpage, u'description', fatal=False)
2412 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2413 webpage, u'thumbnail', fatal=False)
2415 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2416 webpage, u'player url')
# The <meta name="title"> value looks like 'Show : Episode'; keep the tail.
2418 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2419 webpage, u'player url').split(' : ')[-1]
# The player URL carries a URL-encoded config location in its query string.
2421 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2422 configUrl = compat_urllib_parse.unquote(configUrl)
2424 configJSON = self._download_webpage(configUrl, videoId,
2425 u'Downloading configuration',
2426 u'unable to download configuration')
2428 # Technically, it's JavaScript, not JSON
# Crude normalization: single quotes -> double quotes so json.loads accepts it.
2429 configJSON = configJSON.replace("'", '"')
2432 config = json.loads(configJSON)
2433 except (ValueError,) as err:
2434 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2436 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry.
2437 videoUrl = playlist[1]['url']
2442 'uploader': showName,
2443 'upload_date': None,
2446 'thumbnail': imgUrl,
2447 'description': videoDesc,
2448 'player_url': playerUrl,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2453 class CollegeHumorIE(InfoExtractor):
2454 """Information extractor for collegehumor.com"""
2457 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2458 IE_NAME = u'collegehumor'
2460 def report_manifest(self, video_id):
2461 """Report information extraction."""
2462 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2464 def _real_extract(self, url):
2465 mobj = re.match(self._VALID_URL, url)
2467 raise ExtractorError(u'Invalid URL: %s' % url)
2468 video_id = mobj.group('videoid')
2473 'upload_date': None,
2476 self.report_extraction(video_id)
# Step 1: moogaloop metadata XML gives title/description/thumbnail and the
# f4m manifest URL.
2477 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2479 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2480 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2481 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2483 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2485 videoNode = mdoc.findall('./video')[0]
2486 info['description'] = videoNode.findall('./description')[0].text
2487 info['title'] = videoNode.findall('./caption')[0].text
2488 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2489 manifest_url = videoNode.findall('./file')[0].text
2491 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore param is required.
2493 manifest_url += '?hdcore=2.10.3'
2494 self.report_manifest(video_id)
2496 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2498 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# The f4m namespace must be spelled out for ElementTree lookups.
2500 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2502 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2503 node_id = media_node.attrib['url']
2504 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2505 except IndexError as err:
2506 raise ExtractorError(u'Invalid manifest file')
# Step 3: synthesize the segment URL from the manifest's id and media url.
2508 url_pr = compat_urllib_parse_urlparse(manifest_url)
2509 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2516 class XVideosIE(InfoExtractor):
2517 """Information extractor for xvideos.com"""
2519 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2520 IE_NAME = u'xvideos'
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
2525 raise ExtractorError(u'Invalid URL: %s' % url)
2526 video_id = mobj.group(1)
2528 webpage = self._download_webpage(url, video_id)
2530 self.report_extraction(video_id)
# The flash player receives the media URL percent-encoded in flv_url=...
2533 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2534 webpage, u'video URL'))
# Page titles end with '- XVID...'; capture only the part before it.
2537 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2540 # Extract video thumbnail
2541 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2542 webpage, u'thumbnail', fatal=False)
2548 'upload_date': None,
2549 'title': video_title,
2551 'thumbnail': video_thumbnail,
2552 'description': None,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2558 class SoundcloudIE(InfoExtractor):
2559 """Information extractor for soundcloud.com
2560 To access the media, the uid of the song and a stream token
2561 must be extracted from the page source and the script must make
2562 a request to media.soundcloud.com/crossdomain.xml. Then
2563 the media can be grabbed by requesting from an url composed
2564 of the stream token and uid
# Group 1 = uploader slug, group 2 = track slug.
2567 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2568 IE_NAME = u'soundcloud'
2570 def report_resolve(self, video_id):
2571 """Report information extraction."""
2572 self.to_screen(u'%s: Resolving id' % video_id)
2574 def _real_extract(self, url):
2575 mobj = re.match(self._VALID_URL, url)
2577 raise ExtractorError(u'Invalid URL: %s' % url)
2579 # extract uploader (which is in the url)
2580 uploader = mobj.group(1)
2581 # extract simple title (uploader + slug of song title)
2582 slug_title = mobj.group(2)
2583 simple_title = uploader + u'-' + slug_title
2584 full_title = '%s/%s' % (uploader, slug_title)
2586 self.report_resolve(full_title)
# resolve.json maps the human-readable page URL to the API track object.
# The client_id is a hard-coded public API key.
2588 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2589 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2590 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2592 info = json.loads(info_json)
2593 video_id = info['id']
2594 self.report_extraction(full_title)
# The streams endpoint yields the concrete media URLs per encoding.
2596 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2597 stream_json = self._download_webpage(streams_url, full_title,
2598 u'Downloading stream definitions',
2599 u'unable to download stream definitions')
2601 streams = json.loads(stream_json)
2602 mediaURL = streams['http_mp3_128_url']
2603 upload_date = unified_strdate(info['created_at'])
2608 'uploader': info['user']['username'],
2609 'upload_date': upload_date,
2610 'title': info['title'],
2612 'description': info['description'],
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
# Playlist ("set") variant of SoundcloudIE: resolves the set, then extracts
# every track in it.
2615 class SoundcloudSetIE(InfoExtractor):
2616 """Information extractor for soundcloud.com sets
2617 To access the media, the uid of the song and a stream token
2618 must be extracted from the page source and the script must make
2619 a request to media.soundcloud.com/crossdomain.xml. Then
2620 the media can be grabbed by requesting from an url composed
2621 of the stream token and uid
2624 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2625 IE_NAME = u'soundcloud:set'
2627 def report_resolve(self, video_id):
2628 """Report information extraction."""
2629 self.to_screen(u'%s: Resolving id' % video_id)
2631 def _real_extract(self, url):
2632 mobj = re.match(self._VALID_URL, url)
2634 raise ExtractorError(u'Invalid URL: %s' % url)
2636 # extract uploader (which is in the url)
2637 uploader = mobj.group(1)
2638 # extract simple title (uploader + slug of song title)
2639 slug_title = mobj.group(2)
2640 simple_title = uploader + u'-' + slug_title
2641 full_title = '%s/sets/%s' % (uploader, slug_title)
2643 self.report_resolve(full_title)
# Same resolve.json flow as SoundcloudIE, but for the set URL.
2645 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2646 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2647 info_json = self._download_webpage(resolv_url, full_title)
2650 info = json.loads(info_json)
# API-level errors are reported per entry before bailing out.
2651 if 'errors' in info:
2652 for err in info['errors']:
2653 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2656 self.report_extraction(full_title)
# One stream lookup per track in the set.
2657 for track in info['tracks']:
2658 video_id = track['id']
2660 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2661 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2663 self.report_extraction(video_id)
2664 streams = json.loads(stream_json)
2665 mediaURL = streams['http_mp3_128_url']
2670 'uploader': track['user']['username'],
2671 'upload_date': unified_strdate(track['created_at']),
2672 'title': track['title'],
2674 'description': track['description'],
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2679 class InfoQIE(InfoExtractor):
2680 """Information extractor for infoq.com"""
2681 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2683 def _real_extract(self, url):
2684 mobj = re.match(self._VALID_URL, url)
2686 raise ExtractorError(u'Invalid URL: %s' % url)
# No separate id in the URL; the URL itself doubles as the video id.
2688 webpage = self._download_webpage(url, video_id=url)
2689 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref JS variable.
2692 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2694 raise ExtractorError(u'Unable to extract video url')
2695 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2696 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2699 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2702 # Extract description
2703 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2704 webpage, u'description', fatal=False)
# Derive id/extension from the media filename at the end of the RTMP URL.
2706 video_filename = video_url.split('/')[-1]
2707 video_id, extension = video_filename.split('.')
2713 'upload_date': None,
2714 'title': video_title,
2715 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2717 'description': video_description,
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2722 class MixcloudIE(InfoExtractor):
2723 """Information extractor for www.mixcloud.com"""
# Disabled: the site moved to a new API (see comment); tests are skipped.
2725 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2726 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2727 IE_NAME = u'mixcloud'
2729 def report_download_json(self, file_id):
2730 """Report JSON download."""
2731 self.to_screen(u'Downloading json')
2733 def get_urls(self, jsonData, fmt, bitrate='best'):
2734 """Get urls from 'audio_formats' section in json"""
# Formats may or may not be keyed by bitrate; TypeError below handles the
# bitrate-less shape.
2737 bitrate_list = jsonData[fmt]
2738 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2739 bitrate = max(bitrate_list) # select highest
2741 url_list = jsonData[fmt][bitrate]
2742 except TypeError: # we have no bitrate info.
2743 url_list = jsonData[fmt]
2746 def check_urls(self, url_list):
2747 """Returns 1st active url from list"""
# Probes each candidate with a real request; network errors skip to the next.
2748 for url in url_list:
2750 compat_urllib_request.urlopen(url)
2752 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2757 def _print_formats(self, formats):
2758 print('Available formats:')
2759 for fmt in formats.keys():
2760 for b in formats[fmt]:
2762 ext = formats[fmt][b][0]
2763 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2764 except TypeError: # we have no bitrate info
2765 ext = formats[fmt][0]
2766 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2769 def _real_extract(self, url):
2770 mobj = re.match(self._VALID_URL, url)
2772 raise ExtractorError(u'Invalid URL: %s' % url)
2773 # extract uploader & filename from url
2774 uploader = mobj.group(1).decode('utf-8')
2775 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2777 # construct API request
2778 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2779 # retrieve .json file with links to files
2780 request = compat_urllib_request.Request(file_url)
2782 self.report_download_json(file_url)
2783 jsonData = compat_urllib_request.urlopen(request).read()
2784 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2785 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2788 json_data = json.loads(jsonData)
2789 player_url = json_data['player_swf_url']
2790 formats = dict(json_data['audio_formats'])
2792 req_format = self._downloader.params.get('format', None)
2795 if self._downloader.params.get('listformats', None):
2796 self._print_formats(formats)
# 'best' (or no preference): take the first format with a reachable URL.
2799 if req_format is None or req_format == 'best':
2800 for format_param in formats.keys():
2801 url_list = self.get_urls(formats, format_param)
2803 file_url = self.check_urls(url_list)
2804 if file_url is not None:
2807 if req_format not in formats:
2808 raise ExtractorError(u'Format is not available')
2810 url_list = self.get_urls(formats, req_format)
2811 file_url = self.check_urls(url_list)
2812 format_param = req_format
# NOTE(review): .decode() calls imply byte strings (Python 2 era code).
2815 'id': file_id.decode('utf-8'),
2816 'url': file_url.decode('utf-8'),
2817 'uploader': uploader.decode('utf-8'),
2818 'upload_date': None,
2819 'title': json_data['name'],
2820 'ext': file_url.split('.')[-1].decode('utf-8'),
2821 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2822 'thumbnail': json_data['thumbnail_url'],
2823 'description': json_data['description'],
2824 'player_url': player_url.decode('utf-8'),
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2827 class StanfordOpenClassroomIE(InfoExtractor):
2828 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), and the site root (neither) — handled by the three branches below.
2830 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2831 IE_NAME = u'stanfordoc'
2833 def _real_extract(self, url):
2834 mobj = re.match(self._VALID_URL, url)
2836 raise ExtractorError(u'Invalid URL: %s' % url)
2838 if mobj.group('course') and mobj.group('video'): # A specific video
2839 course = mobj.group('course')
2840 video = mobj.group('video')
2842 'id': course + '_' + video,
2844 'upload_date': None,
2847 self.report_extraction(info['id'])
# Per-video XML sits next to the media files under the course folder.
2848 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2849 xmlUrl = baseUrl + video + '.xml'
2851 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2852 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2853 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2854 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2856 info['title'] = mdoc.findall('./title')[0].text
2857 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2859 raise ExtractorError(u'Invalid metadata XML file')
2860 info['ext'] = info['url'].rpartition('.')[2]
# Course page: collect VideoPage links and recurse via self.extract().
2862 elif mobj.group('course'): # A course page
2863 course = mobj.group('course')
2868 'upload_date': None,
2871 coursepage = self._download_webpage(url, info['id'],
2872 note='Downloading course info page',
2873 errnote='Unable to download course info page')
2875 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2877 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2878 coursepage, u'description', fatal=False)
2880 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2883 'type': 'reference',
2884 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2888 for entry in info['list']:
2889 assert entry['type'] == 'reference'
2890 results += self.extract(entry['url'])
# Root page: collect CoursePage links and recurse the same way.
2894 'id': 'Stanford OpenClassroom',
2897 'upload_date': None,
2900 self.report_download_webpage(info['id'])
2901 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2903 rootpage = compat_urllib_request.urlopen(rootURL).read()
2904 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2905 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2907 info['title'] = info['id']
2909 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2912 'type': 'reference',
2913 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2918 for entry in info['list']:
2919 assert entry['type'] == 'reference'
2920 results += self.extract(entry['url'])
# NOTE(review): numbered listing with elided lines; code kept verbatim, comments only.
2923 class MTVIE(InfoExtractor):
2924 """Information extractor for MTV.com"""
2926 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http for the fetch.
2933 if not mobj.group('proto'):
2934 url = 'http://' + url
2935 video_id = mobj.group('videoid')
2937 webpage = self._download_webpage(url, video_id)
# mtv_vt / mtv_an / mtvn_uri meta tags carry song name, title and media URI.
2939 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2940 webpage, u'song name', fatal=False)
2942 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2945 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2946 webpage, u'mtvn_uri', fatal=False)
2948 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2949 webpage, u'content id', fatal=False)
# mediaGen returns an XML document listing renditions for this video.
2951 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2952 self.report_extraction(video_id)
2953 request = compat_urllib_request.Request(videogen_url)
2955 metadataXml = compat_urllib_request.urlopen(request).read()
2956 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2957 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2959 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2960 renditions = mdoc.findall('.//rendition')
2962 # For now, always pick the highest quality.
2963 rendition = renditions[-1]
# Format label is built as '<ext>-<width>x<height>_<bitrate>'.
2966 _,_,ext = rendition.attrib['type'].partition('/')
2967 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2968 video_url = rendition.find('./src').text
2970 raise ExtractorError('Invalid rendition field.')
2975 'uploader': performer,
2976 'upload_date': None,
2977 'title': video_title,
2985 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Youku obfuscates its file ids; the
# helpers below reproduce the player's PRNG-based de-obfuscation.
# NOTE(review): chunk is elided -- several original lines (method headers
# such as the one for the session-id generator, "try:" lines, dict braces)
# are missing between the numbered lines.
2986 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two random components (def line elided).
2989 nowTime = int(time.time() * 1000)
2990 random1 = random.randint(1000,1998)
2991 random2 = random.randint(1000,9999)
2993 return "%d%d%d" %(nowTime,random1,random2)
2995 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet, driven by the server-supplied seed
# via a linear-congruential step (seed*211+30031 mod 65536).
2997 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2999 for i in range(len(source)):
3000 seed = (seed * 211 + 30031 ) % 65536
3001 index = math.floor(seed / 65536 * len(source) )
3002 mixed.append(source[int(index)])
3003 source.remove(source[int(index)])
3004 #return ''.join(mixed)
3007 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index in fileId through the shuffled alphabet.
3008 mixed = self._get_file_ID_mix_string(seed)
3009 ids = fileId.split('*')
3013 realId.append(mixed[int(ch)])
3014 return ''.join(realId)
3016 def _real_extract(self, url):
3017 mobj = re.match(self._VALID_URL, url)
3019 raise ExtractorError(u'Invalid URL: %s' % url)
3020 video_id = mobj.group('ID')
3022 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3024 jsondata = self._download_webpage(info_url, video_id)
3026 self.report_extraction(video_id)
3028 config = json.loads(jsondata)
3030 video_title = config['data'][0]['title']
3031 seed = config['data'][0]['seed']
# Requested format falls back to 'best'; choices limited to what the
# playlist JSON advertises in 'streamfileids'.
3033 format = self._downloader.params.get('format', None)
3034 supported_format = list(config['data'][0]['streamfileids'].keys())
3036 if format is None or format == 'best':
3037 if 'hd2' in supported_format:
3042 elif format == 'worst':
3050 fileid = config['data'][0]['streamfileids'][format]
3051 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3052 except (UnicodeDecodeError, ValueError, KeyError):
3053 raise ExtractorError(u'Unable to extract info section')
3056 sid = self._gen_sid()
3057 fileid = self._get_file_id(fileid, seed)
3059 #column 8,9 of fileid represent the segment number
3060 #fileid[7:9] should be changed
# One download URL per segment; segment number is hex-encoded into the
# file id and the path.
3061 for index, key in enumerate(keys):
3063 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3064 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3067 'id': '%s_part%02d' % (video_id, index),
3068 'url': download_url,
3070 'upload_date': None,
3071 'title': video_title,
3074 files_info.append(info)
3079 class XNXXIE(InfoExtractor):
3080 """Information extractor for xnxx.com"""
# NOTE(review): chunk is elided -- the guard after re.match and the info
# dict opening are on missing original lines.
3082 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail come from query-string
# style variables embedded in the page; title from the <title> tag.
3084 VIDEO_URL_RE = r'flv_url=(.*?)&'
3085 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3086 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3088 def _real_extract(self, url):
3089 mobj = re.match(self._VALID_URL, url)
3091 raise ExtractorError(u'Invalid URL: %s' % url)
3092 video_id = mobj.group(1)
3094 # Get webpage content
3095 webpage = self._download_webpage(url, video_id)
3097 video_url = self._search_regex(self.VIDEO_URL_RE,
3098 webpage, u'video URL')
# The flv_url value is percent-encoded in the page.
3099 video_url = compat_urllib_parse.unquote(video_url)
3101 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
3104 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
3105 webpage, u'thumbnail', fatal=False)
3111 'upload_date': None,
3112 'title': video_title,
3114 'thumbnail': video_thumbnail,
3115 'description': None,
3119 class GooglePlusIE(InfoExtractor):
3120 """Information extractor for plus.google.com."""
# NOTE(review): chunk is elided -- guard lines, "try:" headers and the
# info dict opening are on missing original lines.
3122 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3123 IE_NAME = u'plus.google'
3125 def _real_extract(self, url):
3126 # Extract id from URL
3127 mobj = re.match(self._VALID_URL, url)
3129 raise ExtractorError(u'Invalid URL: %s' % url)
3131 post_url = mobj.group(0)
3132 video_id = mobj.group(1)
3134 video_extension = 'flv'
3136 # Step 1, Retrieve post webpage to extract further information
3137 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3139 self.report_extraction(video_id)
3141 # Extract update date
3142 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
3143 webpage, u'upload date', fatal=False)
3145 # Convert timestring to a format suitable for filename
# Re-format the scraped date as YYYYMMDD (the upload_date convention).
3146 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3147 upload_date = upload_date.strftime('%Y%m%d')
3150 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
3151 webpage, u'uploader', fatal=False)
3154 # Get the first line for title
3155 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
3156 webpage, 'title', default=u'NA')
3158 # Step 2, Stimulate clicking the image box to launch video
3159 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
3160 webpage, u'video page URL')
3161 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3163 # Extract video links on video page
3164 """Extract video links of all sizes"""
3165 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3166 mobj = re.findall(pattern, webpage)
3168 raise ExtractorError(u'Unable to extract video links')
3170 # Sort in resolution
3171 links = sorted(mobj)
3173 # Choose the lowest of the sort, i.e. highest resolution
3174 video_url = links[-1]
3175 # Only get the url. The resolution part in the tuple has no use anymore
3176 video_url = video_url[-1]
3177 # Treat escaped \u0026 style hex
# Python 2 strings take unicode_escape directly; Python 3 needs a bytes
# round-trip (str has no .decode there, hence the AttributeError branch).
3179 video_url = video_url.decode("unicode_escape")
3180 except AttributeError: # Python 3
3181 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3187 'uploader': uploader,
3188 'upload_date': upload_date,
3189 'title': video_title,
3190 'ext': video_extension,
3193 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is built
# directly from the page path against Turner's CDN.
# NOTE(review): chunk is elided -- the invalid-URL guard and parts of the
# returned info dict are on missing original lines.
3194 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
3197 def _real_extract(self, url):
3198 mobj = re.match(self._VALID_URL, url)
3200 raise ExtractorError(u'Invalid URL: %s' % url)
3202 video_id = mobj.group(1)
3204 webpage = self._download_webpage(url, video_id)
# CDN URL is derived from the page path -- no scraping needed for the media.
3206 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3208 shortened_video_id = video_id.rpartition('/')[2]
3209 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
3210 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
3212 # It isn't there in the HTML it returns to us
3213 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
3215 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
3218 'id': shortened_video_id,
3222 # 'uploader_date': uploader_date,
3223 'description': description,
3227 class JustinTVIE(InfoExtractor):
3228 """Information extractor for justin.tv and twitch.tv"""
3229 # TODO: One broadcast may be split into multiple videos. The key
3230 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3231 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): chunk is elided -- pieces of the _VALID_URL alternation,
# guard lines, loop headers and dict braces are on missing original lines.
3233 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3235 (?P<channelid>[^/]+)|
3236 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3237 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3241 _JUSTIN_PAGE_LIMIT = 100
3242 IE_NAME = u'justin.tv'
3244 def report_download_page(self, channel, offset):
3245 """Report attempt to download a single page of videos."""
3246 self.to_screen(u'%s: Downloading video information from %d to %d' %
3247 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3249 # Return count of items, list of *valid* items
3250 def _parse_page(self, url, video_id):
3251 webpage = self._download_webpage(url, video_id,
3252 u'Downloading video info JSON',
3253 u'unable to download video info JSON')
3255 response = json.loads(webpage)
# The API returns a list on success and an error object otherwise.
3256 if type(response) != list:
3257 error_text = response.get('error', 'unknown error')
3258 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3260 for clip in response:
3261 video_url = clip['video_file_url']
3263 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip the dashes from its date prefix -> YYYYMMDD.
3264 video_date = re.sub('-', '', clip['start_time'][:10])
3265 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3266 video_id = clip['id']
3267 video_title = clip.get('title', video_id)
3271 'title': video_title,
3272 'uploader': clip.get('channel_name', video_uploader_id),
3273 'uploader_id': video_uploader_id,
3274 'upload_date': video_date,
3275 'ext': video_extension,
3277 return (len(response), info)
3279 def _real_extract(self, url):
3280 mobj = re.match(self._VALID_URL, url)
3282 raise ExtractorError(u'invalid URL: %s' % url)
3284 api_base = 'http://api.justin.tv'
# Three URL flavors: whole channel archive, chapter (/c/), or single
# broadcast (/b/).
3286 if mobj.group('channelid'):
3288 video_id = mobj.group('channelid')
3289 api = api_base + '/channel/archives/%s.json' % video_id
3290 elif mobj.group('chapterid'):
3291 chapter_id = mobj.group('chapterid')
3293 webpage = self._download_webpage(url, chapter_id)
3294 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3296 raise ExtractorError(u'Cannot find archive of a chapter')
3297 archive_id = m.group(1)
3299 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3300 chapter_info_xml = self._download_webpage(api, chapter_id,
3301 note=u'Downloading chapter information',
3302 errnote=u'Chapter information download failed')
3303 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the chapter's archive id.
3304 for a in doc.findall('.//archive'):
3305 if archive_id == a.find('./id').text:
3308 raise ExtractorError(u'Could not find chapter in chapter information')
3310 video_url = a.find('./video_file_url').text
3311 video_ext = video_url.rpartition('.')[2] or u'flv'
3313 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3314 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3315 note='Downloading chapter metadata',
3316 errnote='Download of chapter metadata failed')
3317 chapter_info = json.loads(chapter_info_json)
3319 bracket_start = int(doc.find('.//bracket_start').text)
3320 bracket_end = int(doc.find('.//bracket_end').text)
3322 # TODO determine start (and probably fix up file)
3323 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3324 #video_url += u'?start=' + TODO:start_timestamp
3325 # bracket_start is 13290, but we want 51670615
3326 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3327 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3330 'id': u'c' + chapter_id,
3333 'title': chapter_info['title'],
3334 'thumbnail': chapter_info['preview'],
3335 'description': chapter_info['description'],
3336 'uploader': chapter_info['channel']['display_name'],
3337 'uploader_id': chapter_info['channel']['name'],
3341 video_id = mobj.group('videoid')
3342 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3344 self.report_extraction(video_id)
# Page through the API _JUSTIN_PAGE_LIMIT items at a time; a short page
# (count != limit) means we reached the end.
3348 limit = self._JUSTIN_PAGE_LIMIT
3351 self.report_download_page(video_id, offset)
3352 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3353 page_count, page_info = self._parse_page(page_url, video_id)
3354 info.extend(page_info)
3355 if not paged or page_count != limit:
3360 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com video pages.
# NOTE(review): chunk is elided -- the invalid-URL guard and the return
# statement/info dict are on missing original lines.
3361 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3363 def _real_extract(self, url):
3364 mobj = re.match(self._VALID_URL, url)
3366 raise ExtractorError(u'invalid URL: %s' % url)
3368 video_id = mobj.group('id')
3369 webpage = self._download_webpage(url, video_id)
# Media URL sits in the second <source> of the <video> element.
3371 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3372 webpage, u'video URL', flags=re.DOTALL)
# Title: prefer the player heading, fall back to <title>.
3374 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3375 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3377 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3378 webpage, u'description', fatal=False, flags=re.DOTALL)
3385 'description': video_description,
3389 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com video/app pages; yields
# a playlist of all movies found on a game's page.
# NOTE(review): chunk is elided -- parts of the verbose _VALID_URL, the
# videos list initialization and the per-video dict braces are on missing
# original lines.
3390 _VALID_URL = r"""http://store\.steampowered\.com/
3392 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3394 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3396 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Age-gate bypass: submit a fixed 1970 birth date via the agecheck URL.
3397 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3400 def suitable(cls, url):
3401 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style whitespace and
# needs re.VERBOSE at match time.
3402 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3404 def _real_extract(self, url):
3405 m = re.match(self._VALID_URL, url, re.VERBOSE)
3406 gameID = m.group('gameID')
3408 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3409 webpage = self._download_webpage(videourl, gameID)
3411 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3412 videourl = self._AGECHECK_TEMPLATE % gameID
3413 self.report_age_confirmation()
3414 webpage = self._download_webpage(videourl, gameID)
3416 self.report_extraction(gameID)
3417 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3418 webpage, 'game title')
# Three parallel scans over the page: movie JS config, titles, thumbnails;
# zipped together below assuming they appear in the same order.
3420 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3421 mweb = re.finditer(urlRE, webpage)
3422 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3423 titles = re.finditer(namesRE, webpage)
3424 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3425 thumbs = re.finditer(thumbsRE, webpage)
3427 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3428 video_id = vid.group('videoID')
3429 title = vtitle.group('videoName')
3430 video_url = vid.group('videoURL')
3431 video_thumb = thumb.group('thumbnail')
3433 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3438 'title': unescapeHTML(title),
3439 'thumbnail': video_thumb
3442 return [self.playlist_result(videos, gameID, game_title)]
3444 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos; the media URL is
# built directly from the recording id against the CDN.
# NOTE(review): chunk is elided -- the info dict opening and return are on
# missing original lines.
3445 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3446 IE_NAME = u'ustream'
3448 def _real_extract(self, url):
3449 m = re.match(self._VALID_URL, url)
3450 video_id = m.group('videoID')
3452 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3453 webpage = self._download_webpage(url, video_id)
3455 self.report_extraction(video_id)
3457 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3460 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3461 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3463 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3464 webpage, u'thumbnail', fatal=False)
3470 'title': video_title,
3471 'uploader': uploader,
3472 'thumbnail': thumbnail,
3476 class WorldStarHipHopIE(InfoExtractor):
# Information extractor for worldstarhiphop.com / worldstarcandy.com.
# NOTE(review): chunk is elided -- the ext selection branches and the info
# dict opening/return are on missing original lines.
3477 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3478 IE_NAME = u'WorldStarHipHop'
3480 def _real_extract(self, url):
3481 m = re.match(self._VALID_URL, url)
3482 video_id = m.group('id')
3484 webpage_src = self._download_webpage(url, video_id)
# Media URL is passed to the flash player via so.addVariable("file", ...).
3486 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3487 webpage_src, u'video URL')
3489 if 'mp4' in video_url:
3494 video_title = self._html_search_regex(r"<title>(.*)</title>",
3495 webpage_src, u'title')
3497 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3498 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3499 webpage_src, u'thumbnail', fatal=False)
3502 _title = r"""candytitles.*>(.*)</span>"""
3503 mobj = re.search(_title, webpage_src)
3504 if mobj is not None:
3505 video_title = mobj.group(1)
3510 'title' : video_title,
3511 'thumbnail' : thumbnail,
3516 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows; metadata comes from a JSON
# blob assigned to window.gon in the page.
# NOTE(review): chunk is elided -- the "try:" around json.loads and the
# info dict opening are on missing original lines.
3517 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3519 def _real_extract(self, url):
3520 m = re.match(self._VALID_URL, url)
3521 video_id = m.group('videoID')
3523 webpage = self._download_webpage(url, video_id)
3525 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3526 webpage, u'json data', flags=re.MULTILINE)
3529 data = json.loads(json_data)
3530 except ValueError as e:
3531 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps variant; extension is taken from the URL path.
3533 video_url = data['akamai_url'] + '&cbr=256'
3534 url_parts = compat_urllib_parse_urlparse(video_url)
3535 video_ext = url_parts.path.rpartition('.')[2]
3540 'title': data['title'],
3541 'description': data.get('teaser_text'),
3542 'location': data.get('country_of_origin'),
3543 'uploader': data.get('host', {}).get('name'),
3544 'uploader_id': data.get('host', {}).get('slug'),
3545 'thumbnail': data.get('image', {}).get('large_url_2x'),
3546 'duration': data.get('duration'),
3551 class YouPornIE(InfoExtractor):
3552 """Information extractor for youporn.com."""
# NOTE(review): chunk is elided -- "try:" headers, loop headers, dict
# braces and several return statements are on missing original lines.
3553 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3555 def _print_formats(self, formats):
3556 """Print all available formats"""
3557 print(u'Available formats:')
3558 print(u'ext\t\tformat')
3559 print(u'---------------------------------')
3560 for format in formats:
3561 print(u'%s\t\t%s' % (format['ext'], format['format']))
3563 def _specific(self, req_format, formats):
# Linear scan for the entry whose 'format' equals the requested one
# (loop header is on an elided line).
3565 if(x["format"]==req_format):
3569 def _real_extract(self, url):
3570 mobj = re.match(self._VALID_URL, url)
3572 raise ExtractorError(u'Invalid URL: %s' % url)
3573 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie before fetching the page.
3575 req = compat_urllib_request.Request(url)
3576 req.add_header('Cookie', 'age_verified=1')
3577 webpage = self._download_webpage(req, video_id)
3579 # Get JSON parameters
3580 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3582 params = json.loads(json_params)
3584 raise ExtractorError(u'Invalid JSON')
3586 self.report_extraction(video_id)
3588 video_title = params['title']
3589 upload_date = unified_strdate(params['release_date_f'])
3590 video_description = params['description']
3591 video_uploader = params['submitted_by']
3592 thumbnail = params['thumbnails'][0]['image']
3594 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3596 # Get all of the formats available
3597 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3598 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3599 webpage, u'download list').strip()
3601 # Get all of the links from the page
3602 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3603 links = re.findall(LINK_RE, download_list_html)
3604 if(len(links) == 0):
3605 raise ExtractorError(u'ERROR: no known formats available for video')
3607 self.to_screen(u'Links found: %d' % len(links))
3612 # A link looks like this:
3613 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3614 # A path looks like this:
3615 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3616 video_url = unescapeHTML( link )
3617 path = compat_urllib_parse_urlparse( video_url ).path
3618 extension = os.path.splitext( path )[1][1:]
# Fifth path component encodes resolution and bitrate, e.g. "480p_370k".
3619 format = path.split('/')[4].split('_')[:2]
3622 format = "-".join( format )
3623 # title = u'%s-%s-%s' % (video_title, size, bitrate)
3628 'uploader': video_uploader,
3629 'upload_date': upload_date,
3630 'title': video_title,
3633 'thumbnail': thumbnail,
3634 'description': video_description
3637 if self._downloader.params.get('listformats', None):
3638 self._print_formats(formats)
3641 req_format = self._downloader.params.get('format', None)
3642 self.to_screen(u'Format: %s' % req_format)
# Format selection: best/worst/all/specific (returns elided in this view).
3644 if req_format is None or req_format == 'best':
3646 elif req_format == 'worst':
3647 return [formats[-1]]
3648 elif req_format in ('-1', 'all'):
3651 format = self._specific( req_format, formats )
3653 raise ExtractorError(u'Requested format not available')
3658 class PornotubeIE(InfoExtractor):
3659 """Information extractor for pornotube.com."""
# NOTE(review): chunk is elided -- the invalid-URL guard and parts of the
# info dict are on missing original lines.
3660 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3662 def _real_extract(self, url):
3663 mobj = re.match(self._VALID_URL, url)
3665 raise ExtractorError(u'Invalid URL: %s' % url)
3667 video_id = mobj.group('videoid')
# Title is taken from the URL path rather than scraped from the page.
3668 video_title = mobj.group('title')
3670 # Get webpage content
3671 webpage = self._download_webpage(url, video_id)
3674 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3675 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
3676 video_url = compat_urllib_parse.unquote(video_url)
3678 #Get the uploaded date
3679 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3680 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3681 if upload_date: upload_date = unified_strdate(upload_date)
3683 info = {'id': video_id,
3686 'upload_date': upload_date,
3687 'title': video_title,
3693 class YouJizzIE(InfoExtractor):
3694 """Information extractor for youjizz.com."""
# NOTE(review): chunk is elided -- the guards after re.match/re.search and
# parts of the info dict are on missing original lines.
3695 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3697 def _real_extract(self, url):
3698 mobj = re.match(self._VALID_URL, url)
3700 raise ExtractorError(u'Invalid URL: %s' % url)
3702 video_id = mobj.group('videoid')
3704 # Get webpage content
3705 webpage = self._download_webpage(url, video_id)
3707 # Get the video title
3708 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3709 webpage, u'title').strip()
3711 # Get the embed page
# The real media URL lives on a separate embed page linked from the
# watch page; video_id is rebound to the embed page's numeric id.
3712 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3714 raise ExtractorError(u'ERROR: unable to extract embed page')
3716 embed_page_url = result.group(0).strip()
3717 video_id = result.group('videoid')
3719 webpage = self._download_webpage(embed_page_url, video_id)
3722 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3723 webpage, u'video URL')
3725 info = {'id': video_id,
3727 'title': video_title,
3730 'player_url': embed_page_url}
3734 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes; walks the play/next API one
# track at a time until at_last_track is reported.
# NOTE(review): chunk is elided -- the invalid-URL guard, mix_id binding,
# dict braces and loop-exit break are on missing original lines.
3736 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3738 def _real_extract(self, url):
3739 mobj = re.match(self._VALID_URL, url)
3741 raise ExtractorError(u'Invalid URL: %s' % url)
3742 playlist_id = mobj.group('id')
3744 webpage = self._download_webpage(url, playlist_id)
3746 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3747 data = json.loads(json_like)
# Random session token required by the play API.
3749 session = str(random.randint(0, 1000000000))
3751 track_count = data['tracks_count']
3752 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3753 next_url = first_url
3755 for i in itertools.count():
3756 api_json = self._download_webpage(next_url, playlist_id,
3757 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3758 errnote=u'Failed to download song information')
3759 api_data = json.loads(api_json)
3760 track_data = api_data[u'set']['track']
3762 'id': track_data['id'],
3763 'url': track_data['track_file_stream_url'],
3764 'title': track_data['performer'] + u' - ' + track_data['name'],
3765 'raw_title': track_data['name'],
3766 'uploader_id': data['user']['login'],
3770 if api_data['set']['at_last_track']:
3772 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3775 class KeekIE(InfoExtractor):
# Information extractor for keek.com; media and thumbnail URLs are built
# directly from the video id against the CDN.
# NOTE(review): chunk is elided -- the info dict opening/return are on
# missing original lines.
3776 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3779 def _real_extract(self, url):
3780 m = re.match(self._VALID_URL, url)
3781 video_id = m.group('videoID')
3783 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3784 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3785 webpage = self._download_webpage(url, video_id)
3787 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3790 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3791 webpage, u'uploader', fatal=False)
3797 'title': video_title,
3798 'thumbnail': thumbnail,
3799 'uploader': uploader
3803 class TEDIE(InfoExtractor):
# Information extractor for ted.com talks and playlists.
# NOTE(review): chunk is elided -- parts of the verbose _VALID_URL, the
# playlist/talk branch structure and the info dict braces are on missing
# original lines.
3804 _VALID_URL=r'''http://www\.ted\.com/
3806 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3808 ((?P<type_talk>talks)) # We have a simple talk
3810 (/lang/(.*?))? # The url may contain the language
3811 /(?P<name>\w+) # Here goes the name and then ".html"
3815 def suitable(cls, url):
3816 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL needs re.VERBOSE at match time.
3817 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3819 def _real_extract(self, url):
3820 m=re.match(self._VALID_URL, url, re.VERBOSE)
3821 if m.group('type_talk'):
3822 return [self._talk_info(url)]
3824 playlist_id=m.group('playlist_id')
3825 name=m.group('name')
3826 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3827 return [self._playlist_videos_info(url,name,playlist_id)]
3829 def _playlist_videos_info(self,url,name,playlist_id=0):
3830 '''Returns the videos of the playlist'''
# Two parallel regex scans (talk markup + talk titles/urls), zipped below.
3832 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3833 ([.\s]*?)data-playlist_item_id="(\d+)"
3834 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3836 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3837 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3838 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3839 m_names=re.finditer(video_name_RE,webpage)
3841 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3842 webpage, 'playlist title')
3844 playlist_entries = []
3845 for m_video, m_name in zip(m_videos,m_names):
3846 video_id=m_video.group('video_id')
3847 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
# Each talk is delegated back to this extractor via a url_result entry.
3848 playlist_entries.append(self.url_result(talk_url, 'TED'))
3849 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3851 def _talk_info(self, url, video_id=0):
3852 """Return the video for the talk in the url"""
3853 m = re.match(self._VALID_URL, url,re.VERBOSE)
3854 video_name = m.group('name')
3855 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3856 self.report_extraction(video_name)
3857 # If the url includes the language we get the title translated
3858 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3860 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3861 webpage, 'json data')
3862 info = json.loads(json_data)
3863 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3864 webpage, 'description', flags = re.DOTALL)
3866 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3867 webpage, 'thumbnail')
# Last htmlStreams entry is used as the media URL (highest listed stream).
3870 'url': info['htmlStreams'][-1]['file'],
3873 'thumbnail': thumbnail,
3874 'description': desc,
3878 class MySpassIE(InfoExtractor):
# Information extractor for myspass.de; all metadata comes from a
# server-side XML metadata endpoint keyed by the numeric video id.
# NOTE(review): chunk is elided -- the trailing-slash retry condition,
# else-branches and the info dict opening are on missing original lines.
3879 _VALID_URL = r'http://www.myspass.de/.*'
3881 def _real_extract(self, url):
3882 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3884 # video id is the last path element of the URL
3885 # usually there is a trailing slash, so also try the second but last
3886 url_path = compat_urllib_parse_urlparse(url).path
3887 url_parent_path, video_id = os.path.split(url_path)
3889 _, video_id = os.path.split(url_parent_path)
3892 metadata_url = META_DATA_URL_TEMPLATE % video_id
3893 metadata_text = self._download_webpage(metadata_url, video_id)
3894 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3896 # extract values from metadata
3897 url_flv_el = metadata.find('url_flv')
3898 if url_flv_el is None:
3899 raise ExtractorError(u'Unable to extract download url')
3900 video_url = url_flv_el.text
3901 extension = os.path.splitext(video_url)[1][1:]
3902 title_el = metadata.find('title')
3903 if title_el is None:
3904 raise ExtractorError(u'Unable to extract title')
3905 title = title_el.text
3906 format_id_el = metadata.find('format_id')
3907 if format_id_el is None:
3910 format = format_id_el.text
# description and thumbnail are optional in the metadata XML.
3911 description_el = metadata.find('description')
3912 if description_el is not None:
3913 description = description_el.text
3916 imagePreview_el = metadata.find('imagePreview')
3917 if imagePreview_el is not None:
3918 thumbnail = imagePreview_el.text
3927 'thumbnail': thumbnail,
3928 'description': description
3932 class SpiegelIE(InfoExtractor):
# Information extractor for spiegel.de videos; stream info comes from a
# per-video XML document on video2.spiegel.de.
# NOTE(review): chunk is elided -- the info dict opening/return are on
# missing original lines.
3933 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3935 def _real_extract(self, url):
3936 m = re.match(self._VALID_URL, url)
3937 video_id = m.group('videoID')
3939 webpage = self._download_webpage(url, video_id)
3941 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3944 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3945 xml_code = self._download_webpage(xml_url, video_id,
3946 note=u'Downloading XML', errnote=u'Failed to download XML')
3948 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child of the XML root is used as the selected type/quality.
3949 last_type = idoc[-1]
3950 filename = last_type.findall('./filename')[0].text
3951 duration = float(last_type.findall('./duration')[0].text)
3953 video_url = 'http://video2.spiegel.de/flash/' + filename
3954 video_ext = filename.rpartition('.')[2]
3959 'title': video_title,
3960 'duration': duration,
3964 class LiveLeakIE(InfoExtractor):
# Information extractor for liveleak.com view pages.
# NOTE(review): chunk is elided -- the invalid-URL guard and the info dict
# opening/return are on missing original lines.
3966 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3967 IE_NAME = u'liveleak'
3969 def _real_extract(self, url):
3970 mobj = re.match(self._VALID_URL, url)
3972 raise ExtractorError(u'Invalid URL: %s' % url)
3974 video_id = mobj.group('video_id')
3976 webpage = self._download_webpage(url, video_id)
# Media URL appears in the player config as file: "...".
3978 video_url = self._search_regex(r'file: "(.*?)",',
3979 webpage, u'video URL')
# Site prefix "LiveLeak.com -" is stripped from the og:title value.
3981 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3982 webpage, u'title').replace('LiveLeak.com -', '').strip()
3984 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3985 webpage, u'description', fatal=False)
3987 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3988 webpage, u'uploader', fatal=False)
3994 'title': video_title,
3995 'description': video_description,
3996 'uploader': video_uploader
4001 class ARDIE(InfoExtractor):
# Information extractor for ARD Mediathek / daserste.de; streams are read
# from mediaCollection.addMediaStream(...) calls in the page.
# NOTE(review): chunk is elided -- branch headers around the documentId
# fallback, the empty-streams check and the final return are on missing
# original lines.
4002 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4003 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4004 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4006 def _real_extract(self, url):
4007 # determine video id from url
4008 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId query parameter over the path segment.
4010 numid = re.search(r'documentId=([0-9]+)', url)
4012 video_id = numid.group(1)
4014 video_id = m.group('video_id')
4016 # determine title and media streams from webpage
4017 html = self._download_webpage(url, video_id)
4018 title = re.search(self._TITLE, html).group('title')
4019 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker in the page means an age-restricted broadcast.
4021 assert '"fsk"' in html
4022 raise ExtractorError(u'This video is only available after 8:00 pm')
4024 # choose default media type and highest quality for now
4025 stream = max([s for s in streams if int(s["media_type"]) == 0],
4026 key=lambda s: int(s["quality"]))
4028 # there's two possibilities: RTMP stream or HTTP download
4029 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4030 if stream['rtmp_url']:
4031 self.to_screen(u'RTMP download detected')
4032 assert stream['video_url'].startswith('mp4:')
4033 info["url"] = stream["rtmp_url"]
4034 info["play_path"] = stream['video_url']
4036 assert stream["video_url"].endswith('.mp4')
4037 info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    # FIX: the dot before "mp4" was unescaped and therefore matched any character.
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*\.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # FIX: initialize stream_ so the "no stream found" branch below
        # raises ExtractorError instead of NameError.
        stream_ = None
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// URL; fall back to rtsp:// if none is present.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # FIX: corrected message typo ("extention").
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext,
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for video posts hosted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        blog_name = match.group('blog_name')
        video_id = match.group('id')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog_name, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded inside escaped JS ('\x22' is '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog_name, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext,
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: local was named `id`, shadowing the builtin; renamed to track_id.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist'],
                      }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')

        # The MRSS feed carries both the title and the direct mp4 link.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for this id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')

        webpage = self._download_webpage(
            'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id, video_id)

        # The per-video "secret" is required by both XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        m_stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = m_stream.group(1) + unescapeHTML(m_stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id (needed for the data feed) only appears in the markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')

        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        m_media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m_media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(m_media.group('server')) == 0:
            # No server given: the 'file' field already holds a quoted URL.
            video_url = compat_urllib_parse.unquote(m_media.group('file'))
        else:
            video_url = m_media.group('server')+'/key='+m_media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m_date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m_date:
            video_upload_date = m_date.group('upload_date_Y')+m_date.group('upload_date_m')+m_date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # A timestamped request is required; the returned Set-Cookie header
        # is replayed on the serve request further down.
        data = { 'ax': 1, 'ts': time.time() }
        complete_url = url + "?" + compat_urllib_parse.urlencode(data)
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    'mp3',
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page immediately bounces through a JS redirect.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The media URL has to be requested from a form endpoint.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "url=<media>&thumb=<image>" -> keep the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       'flv',
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than clips.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            # FIX: this used to call the misspelled name `ExtractError`
            # (with a message typo "extrat"), which crashed with NameError
            # instead of reporting the real problem.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url': video_url,
                'id': video_id,
                'title': video_title,
                # Videos are actually flv not mp4
                'ext': 'flv',
                'thumbnail': video_thumb,
                'description': video_description,
                }
class StatigramIE(InfoExtractor):
    """Information Extractor for Instagram videos viewed through statigr.am."""
    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        html_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # Page titles look like "<caption> | Statigram"; drop the suffix.
        title = html_title.rpartition(u' | Statigram')[0]
        uploader = self._html_search_regex(
            r'@(.+) \(Videos\)', title, u'uploader name', fatal=False)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     title,
            'thumbnail': thumbnail_url,
            'uploader' : uploader
        }]
# Registry of every supported extractor; FileDownloader walks this list and
# hands the URL to the first IE whose suitable() matches, so ordering is
# significant (more specific extractors must come before generic ones).
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``ie_name + 'IE'``."""
    class_name = ie_name + 'IE'
    return globals()[class_name]