YoutubeIE: Detect new Vevo style videos
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # set to False in subclasses that are known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Runs at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention class names end in 'IE'; strip those two characters.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle.

        note=None prints the default "Downloading webpage" message,
        note=False suppresses output, any other value is printed verbatim.
        Raises ExtractorError (with the original traceback) on failure.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Returns a tuple (page content as string, URL handle)."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header; fall back
        # to UTF-8 when none is announced.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps arbitrary (possibly binary) payloads printable
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string."""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        # Initialize so an empty pattern list cannot leave mobj unbound
        # (previously that raised NameError instead of the intended
        # warning/ExtractorError path below).
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj: break

        # Highlight the field name in blue when writing to a terminal
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res
234
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (first result only), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search 'URL' and delegate to _get_n_results()."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # bare search key: fetch only the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the service limit with a warning
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message ("sublclasses" -> "subclasses")
        raise NotImplementedError("This method must be implemented by subclasses")
273
274
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode regex; also accepts a bare video ID with no hostname.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # URL that forces the site language to English for stable scraping
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url redirect parameter
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats not listed default to 'flv'
    # (see _print_formats)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> display dimensions as printed by _print_formats
    # (NOTE(review): values appear to be height x width — confirm before relying on it)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
334
335     @classmethod
336     def suitable(cls, url):
337         """Receives a URL and returns True if suitable for this IE."""
338         if YoutubePlaylistIE.suitable(url): return False
339         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
340
341     def report_lang(self):
342         """Report attempt to set language."""
343         self.to_screen(u'Setting language')
344
345     def report_login(self):
346         """Report attempt to log in."""
347         self.to_screen(u'Logging in')
348
349     def report_video_webpage_download(self, video_id):
350         """Report attempt to download video webpage."""
351         self.to_screen(u'%s: Downloading video webpage' % video_id)
352
353     def report_video_info_webpage_download(self, video_id):
354         """Report attempt to download video info webpage."""
355         self.to_screen(u'%s: Downloading video info webpage' % video_id)
356
    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
360
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of one subtitle track (language + format)."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
364
365     def report_video_subtitles_available(self, video_id, sub_lang_list):
366         """Report available subtitles."""
367         sub_lang = ",".join(list(sub_lang_list.keys()))
368         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
369
370     def report_information_extraction(self, video_id):
371         """Report attempt to extract video information."""
372         self.to_screen(u'%s: Extracting video information' % video_id)
373
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
377
378     def report_rtmp_download(self):
379         """Indicate the download will use the RTMP protocol."""
380         self.to_screen(u'RTMP download detected')
381
382     def _get_available_subtitles(self, video_id):
383         self.report_video_subtitles_download(video_id)
384         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
385         try:
386             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388             return (u'unable to download video subtitles: %s' % compat_str(err), None)
389         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391         if not sub_lang_list:
392             return (u'video doesn\'t have subtitles', None)
393         return sub_lang_list
394
395     def _list_available_subtitles(self, video_id):
396         sub_lang_list = self._get_available_subtitles(video_id)
397         self.report_video_subtitles_available(video_id, sub_lang_list)
398
399     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
400         """
401         Return tuple:
402         (error_message, sub_lang, sub)
403         """
404         self.report_video_subtitles_request(video_id, sub_lang, format)
405         params = compat_urllib_parse.urlencode({
406             'lang': sub_lang,
407             'name': sub_name,
408             'v': video_id,
409             'fmt': format,
410         })
411         url = 'http://www.youtube.com/api/timedtext?' + params
412         try:
413             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
416         if not sub:
417             return (u'Did not fetch video subtitles', None, None)
418         return (None, sub_lang, sub)
419
420     def _request_automatic_caption(self, video_id, webpage):
421         """We need the webpage for getting the captions url, pass it as an
422            argument to speed up the process."""
423         sub_lang = self._downloader.params.get('subtitleslang')
424         sub_format = self._downloader.params.get('subtitlesformat')
425         self.to_screen(u'%s: Looking for automatic captions' % video_id)
426         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
428         if mobj is None:
429             return [(err_msg, None, None)]
430         player_config = json.loads(mobj.group(1))
431         try:
432             args = player_config[u'args']
433             caption_url = args[u'ttsurl']
434             timestamp = args[u'timestamp']
435             params = compat_urllib_parse.urlencode({
436                 'lang': 'en',
437                 'tlang': sub_lang,
438                 'fmt': sub_format,
439                 'ts': timestamp,
440                 'kind': 'asr',
441             })
442             subtitles_url = caption_url + '&' + params
443             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444             return [(None, sub_lang, sub)]
445         except KeyError:
446             return [(err_msg, None, None)]
447
448     def _extract_subtitle(self, video_id):
449         """
450         Return a list with a tuple:
451         [(error_message, sub_lang, sub)]
452         """
453         sub_lang_list = self._get_available_subtitles(video_id)
454         sub_format = self._downloader.params.get('subtitlesformat')
455         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456             return [(sub_lang_list[0], None, None)]
457         if self._downloader.params.get('subtitleslang', False):
458             sub_lang = self._downloader.params.get('subtitleslang')
459         elif 'en' in sub_lang_list:
460             sub_lang = 'en'
461         else:
462             sub_lang = list(sub_lang_list.keys())[0]
463         if not sub_lang in sub_lang_list:
464             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
465
466         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
467         return [subtitle]
468
469     def _extract_all_subtitles(self, video_id):
470         sub_lang_list = self._get_available_subtitles(video_id)
471         sub_format = self._downloader.params.get('subtitlesformat')
472         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473             return [(sub_lang_list[0], None, None)]
474         subtitles = []
475         for sub_lang in sub_lang_list:
476             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477             subtitles.append(subtitle)
478         return subtitles
479
480     def _print_formats(self, formats):
481         print('Available formats:')
482         for x in formats:
483             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
484
    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age.

        Credentials come from --username/--password or, failing that, from
        the 'youtube' machine entry in ~/.netrc. Every failure before the
        age-confirmation step degrades to a warning; only a failed age
        confirmation raises.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so later regexes match)
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to scrape the anti-forgery tokens below
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden GALX/dsh form fields required by the Google login endpoint
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
587
588     def _extract_id(self, url):
589         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
590         if mobj is None:
591             raise ExtractorError(u'Invalid URL: %s' % url)
592         video_id = mobj.group(2)
593         return video_id
594
595     def _real_extract(self, url):
596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597         mobj = re.search(self._NEXT_URL_RE, url)
598         if mobj:
599             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600         video_id = self._extract_id(url)
601
602         # Get video webpage
603         self.report_video_webpage_download(video_id)
604         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605         request = compat_urllib_request.Request(url)
606         try:
607             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
610
611         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
612
613         # Attempt to extract SWF player URL
614         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
615         if mobj is not None:
616             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
617         else:
618             player_url = None
619
620         # Get video info
621         self.report_video_info_webpage_download(video_id)
622         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624                     % (video_id, el_type))
625             video_info_webpage = self._download_webpage(video_info_url, video_id,
626                                     note=False,
627                                     errnote='unable to download video info webpage')
628             video_info = compat_parse_qs(video_info_webpage)
629             if 'token' in video_info:
630                 break
631         if 'token' not in video_info:
632             if 'reason' in video_info:
633                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
634             else:
635                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
636
637         # Check for "rental" videos
638         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639             raise ExtractorError(u'"rental" videos not supported')
640
641         # Start extracting information
642         self.report_information_extraction(video_id)
643
644         # uploader
645         if 'author' not in video_info:
646             raise ExtractorError(u'Unable to extract uploader name')
647         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
648
649         # uploader_id
650         video_uploader_id = None
651         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
652         if mobj is not None:
653             video_uploader_id = mobj.group(1)
654         else:
655             self._downloader.report_warning(u'unable to extract uploader nickname')
656
657         # title
658         if 'title' not in video_info:
659             raise ExtractorError(u'Unable to extract video title')
660         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
661
662         # thumbnail image
663         if 'thumbnail_url' not in video_info:
664             self._downloader.report_warning(u'unable to extract video thumbnail')
665             video_thumbnail = ''
666         else:   # don't panic if we can't find it
667             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
668
669         # upload date
670         upload_date = None
671         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
672         if mobj is not None:
673             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674             upload_date = unified_strdate(upload_date)
675
676         # description
677         video_description = get_element_by_id("eow-description", video_webpage)
678         if video_description:
679             video_description = clean_html(video_description)
680         else:
681             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
682             if fd_mobj:
683                 video_description = unescapeHTML(fd_mobj.group(1))
684             else:
685                 video_description = u''
686
687         # subtitles
688         video_subtitles = None
689
690         if self._downloader.params.get('writesubtitles', False):
691             video_subtitles = self._extract_subtitle(video_id)
692             if video_subtitles:
693                 (sub_error, sub_lang, sub) = video_subtitles[0]
694                 if sub_error:
695                     # We try with the automatic captions
696                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
698                     if sub is not None:
699                         pass
700                     else:
701                         # We report the original error
702                         self._downloader.report_error(sub_error)
703
704         if self._downloader.params.get('allsubtitles', False):
705             video_subtitles = self._extract_all_subtitles(video_id)
706             for video_subtitle in video_subtitles:
707                 (sub_error, sub_lang, sub) = video_subtitle
708                 if sub_error:
709                     self._downloader.report_error(sub_error)
710
711         if self._downloader.params.get('listsubtitles', False):
712             sub_lang_list = self._list_available_subtitles(video_id)
713             return
714
715         if 'length_seconds' not in video_info:
716             self._downloader.report_warning(u'unable to extract video duration')
717             video_duration = ''
718         else:
719             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
720
721         # token
722         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
723
724         # Decide which formats to download
725         req_format = self._downloader.params.get('format', None)
726
727         try:
728             mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
729             info = json.loads(mobj.group(1))
730             if 'dashmpd' in info['args']:
731                 # Vevo videos with encrypted signatures
732                 self.to_screen(u'Vevo video detected.')
733                 video_info['url_encoded_fmt_stream_map'] = [info['args']['url_encoded_fmt_stream_map']]
734         except ValueError:
735             pass
736
737         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
738             self.report_rtmp_download()
739             video_url_list = [(None, video_info['conn'][0])]
740         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
741             url_map = {}
742             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
743                 url_data = compat_parse_qs(url_data_str)
744                 if 'itag' in url_data and 'url' in url_data:
745                     url = url_data['url'][0]
746                     if 'sig' in url_data:
747                         url += '&signature=' + url_data['sig'][0]
748                     if 's' in url_data:
749                         def k(s):
750                             """Decrypt the key the two subkeys must have a length of 43"""
751                             (a,b) = s.split('.')
752                             b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
753                             a = a[-40:]
754                             s_dec = '.'.join((a,b))[::-1]
755                             return s_dec
756                         key = k(url_data['s'][0])
757                         url += '&signature=' + key
758                     if 'ratebypass' not in url:
759                         url += '&ratebypass=yes'
760                     url_map[url_data['itag'][0]] = url
761
762             format_limit = self._downloader.params.get('format_limit', None)
763             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
764             if format_limit is not None and format_limit in available_formats:
765                 format_list = available_formats[available_formats.index(format_limit):]
766             else:
767                 format_list = available_formats
768             existing_formats = [x for x in format_list if x in url_map]
769             if len(existing_formats) == 0:
770                 raise ExtractorError(u'no known formats available for video')
771             if self._downloader.params.get('listformats', None):
772                 self._print_formats(existing_formats)
773                 return
774             if req_format is None or req_format == 'best':
775                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
776             elif req_format == 'worst':
777                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
778             elif req_format in ('-1', 'all'):
779                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
780             else:
781                 # Specific formats. We pick the first in a slash-delimeted sequence.
782                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
783                 req_formats = req_format.split('/')
784                 video_url_list = None
785                 for rf in req_formats:
786                     if rf in url_map:
787                         video_url_list = [(rf, url_map[rf])]
788                         break
789                 if video_url_list is None:
790                     raise ExtractorError(u'requested format not available')
791         else:
792             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
793
794         results = []
795         for format_param, video_real_url in video_url_list:
796             # Extension
797             video_extension = self._video_extensions.get(format_param, 'flv')
798
799             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
800                                               self._video_dimensions.get(format_param, '???'))
801
802             results.append({
803                 'id':       video_id,
804                 'url':      video_real_url,
805                 'uploader': video_uploader,
806                 'uploader_id': video_uploader_id,
807                 'upload_date':  upload_date,
808                 'title':    video_title,
809                 'ext':      video_extension,
810                 'format':   video_format,
811                 'thumbnail':    video_thumbnail,
812                 'description':  video_description,
813                 'player_url':   player_url,
814                 'subtitles':    video_subtitles,
815                 'duration':     video_duration
816             })
817         return results
818
819
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and submit the age-confirmation form
        so family-filtered videos are accessible for this session.

        Raises ExtractorError on any network failure.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video URL, title and uploader from a metacafe watch page.

        Returns a one-element list with the info dictionary, or delegates
        to the Youtube extractor for "yt-<id>" video ids.
        Raises ExtractorError when the URL is invalid or extraction fails.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: media info is embedded in the flashvars parameter
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # JSON escapes forward slashes; undo that before using the URL
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: _download_webpage already returns a unicode string, so the
        # previous .decode('utf-8') calls broke on Python 3 and raised
        # UnicodeDecodeError for non-ASCII content on Python 2.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
915
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract video URL, title, uploader and upload date from a
        Dailymotion watch page and return a one-element info list."""
        # Validate the URL and isolate the raw id segment
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Drop the "_title" suffix and any query string from the id
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Request the page with the family filter switched off
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best quality the player exposes, in preference order
        max_quality = None
        for key in ('hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url'):
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            raise ExtractorError(u'Unable to extract video URL')

        # JSON escapes forward slashes; undo before use
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(m.group('title'))

        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        # Page shows DD-MM-YYYY; convert to YYYYMMDD
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
984
985
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a photobucket.com page.

        Tries the embedded JSON (Pb.Data.Shared) first, then falls back to
        scraping the <link rel="video_src"> tag and the page title.
        Raises ExtractorError when the URL is invalid or extraction fails.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: the webpage (and everything derived from it) is already a
        # unicode string, so the previous .decode('utf-8') calls broke on
        # Python 3 and on non-ASCII content on Python 2.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1042
1043
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com page.

        Two paths exist: when the page exposes a Media CONTENT_ID, query the
        YQL JSON API; otherwise fall back to the legacy cosmos.bcst.yahoo.com
        MRSS endpoints. Returns a single info dictionary.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: the None check must come BEFORE the .group() calls;
            # previously a failed match raised AttributeError instead of
            # the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper first
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1111
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page.

        Parses the player config JSON embedded in the page, chooses a
        codec/quality combination and builds the play_redirect URL.
        Raises ExtractorError on invalid URLs, restricted videos or when
        no known codec is available.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: narrowed from a bare "except:", which also swallowed
        # KeyboardInterrupt/SystemExit and hid user aborts.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1213
1214
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live stream pages are recognised by an "index-<n>.html" suffix
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body.

        NOTE(review): the body is returned exactly as read() delivers it,
        without decoding — bytes on Python 3 (this class predates the shared
        _download_webpage helper used by the other extractors).
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect the requested match groups.

        matchTuples is a list of (group_index, key, error_message) triples;
        each present group is stored under key in the returned dict, and a
        missing group raises ExtractorError with the given message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP parameters for a live-stream page.

        NOTE(review): the computed video_url on the last line is discarded
        and this method returns None, so live extraction currently produces
        no downloadable result (see _real_extract).
        """
        # The language code is encoded in the URL path (e.g. /fr/ or /de/)
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file contains the stream path, SWF player and RTMP base URL
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of Arte+7 metadata documents and return the
        info dictionary for the HD stream."""
        video_lang = url.split('/')[-3]
        # Step 1: the page's flash params point at a videoref XML file
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the per-language <video> reference
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final document carries id, title, date and the HD URL
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # .decode() here assumes Python 2 byte strings from fetch_webpage;
            # NOTE(review): this would fail on Python 3 (str has no decode)
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns None and its result is
            # ignored, so live pages yield no downloadable entry here
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1334
1335
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no specialised extractor claims the URL: follows URL
    shorteners/redirects, then scans the page for common embedded-player
    patterns (JW Player flashvars, file/source parameters, Twitter
    player cards) to locate a direct video URL.
    """

    # Matches anything; this IE must therefore be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn (outside of test mode) that we are down to guessing.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass issuing HEAD instead of GET so the redirect
            # chain can be resolved without downloading response bodies.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the same URL as a plain GET through the opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # No redirect happened; caller should proceed with the given URL.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Resolve shorteners/redirects first and delegate the new URL to
        # whichever extractor claims it.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1468
1469
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the GData search API (50 results per request)
        until n ids are collected or the API reports no more matches,
        then returns a playlist of watch-page URL results.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # idiom fix: "x not in y" instead of "not x in y"
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the total number of matches; never ask for
            # more pages than actually exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot the requested count.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1511
1512
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Scrapes the Google Video result pages (10 hits per page) and
        returns a playlist dict with at most n url entries.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once enough pages were fetched or there is no "next" link.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # fix: the last page can overshoot the requested count, so
                # previously up to a full extra page of entries leaked out;
                # cap the playlist at n results.
                del res['entries'][n:]
                return res
1543
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo screen-search JSON endpoint (30 results
        per page) until n entries are collected or the service reports
        the last page.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']          # paging metadata ('last'/'total' page indices)
            results = info[u'results']

            # fix: i was unbound (NameError below) when a page came back
            # with no results at all
            i = 0
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # fix: skip entries without a recognizable video link
                    # instead of crashing on mobj.group()
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results were processed or this was the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1577
1578
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Downloads the playlist contents page by page through the GData API
    and returns the videos as url results sorted by playlist position.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData endpoint; %i slots are (page size, 1-based start index).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class's
        # plain re.match against it cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # The id is captured by either alternative of _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so entries can be sorted by their
            # playlist position afterwards; entries without 'content' are
            # skipped (e.g. deleted/private videos).
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A page that is not "full" must be the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the position component.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1644
1645
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first-seen order, no duplicates."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Return a playlist containing every video of the channel."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is served as plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = self.extract_videos_from_page(page)

        # Further pages come from the JSON channel_ajax endpoint; keep
        # requesting while the "load more" widget is still present.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1703
1704
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Return a playlist containing every upload of the given user."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        # The GData API returns at most _GDATA_PAGE_SIZE entries per
        # request, so walk consecutive pages until one comes back short —
        # that page must be the last one.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect this page's ids, keeping first-seen order and
            # dropping duplicates.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)
            video_ids.extend(ids_in_page)

            # A page that is not "full" is the last one; stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1761
1762
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist with every video posted by the given user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric users_id required by the API is embedded in the
        # user's page.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # fix: fail with a clear extractor error instead of an
            # AttributeError when the users-id is missing from the page
            raise ExtractorError(u'Unable to extract user id from: %s' % url)
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # fix: unescape before the duplicate check, so ids with
                # HTML entities are deduplicated consistently
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1821
1822
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to its direct download URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # fix: raw string so \s is a regex class, not a string escape
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # fix: file_id/file_url/file_extension are already text strings;
        # calling .decode() on them raises AttributeError on Python 3
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1867
1868
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come from --username/--password or, with --netrc,
        from the 'facebook' machine entry in ~/.netrc.  Failures only
        emit warnings: extraction is still attempted anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Being served the login form again means the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical video page URL before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded between these two exact
        # JavaScript snippets on the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is a URL-quoted JSON document describing the video files.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to the SD one.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1963
1964
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Pulls the filename extension out of the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf URLs carry the id in the fragment; map them to /play/.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment names the
            # file; resolve it and restart extraction on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the page for its JSON description.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode does not exist on Python 3, so
                # this branch would crash there — confirm target version.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' object or given
                # directly at the top level.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2062
2063
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Apply the RC4 stream cipher to ``data`` with ``key``.

        RC4 is symmetric, so the same routine encrypts and decrypts.
        Returns the result as a str of XORed character values.
        """
        # Key-scheduling algorithm (KSA)
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Pseudo-random generation algorithm (PRGA)
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of ``s`` as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded static secret used to derive the RC4 key
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case first: the flv source is embedded directly in the page
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            # extracted only to fail loudly if the URL has no extension;
            # the returned ext is always flv here
            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml: the player XML URL is hidden in the flashvars
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data: hex-encoded RC4 ciphertext; the key is
        # md5(b64decode(b64decode(GK)) + md5(video_id))
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # BUG FIX: this branch previously referenced an undefined name
            # `video_filepath` and raised a NameError for every f4m video.
            # The base path is available (percent-encoded) in the decrypted
            # data, next to the `source=` entry parsed above.
            video_filepath = compat_urllib_parse.unquote(
                self._search_regex('path=\'(.*?)\'', dec_data, u'video path'))
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2212
2213
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest first; turls entries are matched against these
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension (everything is mp4 here)
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions, used only for --list-formats output
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE,
        # which the default implementation does not apply.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print one "bitrate : ext [dimensions]" line per known format
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ':tds'-style shortcuts are rewritten to the show's
        # full-episodes page and re-matched
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # a bare show URL (empty episode group) means "newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # the site redirected us to the latest episode; recover its
            # concrete URL from the response handle and re-match
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mtvnservices URI embedded either as a <param name="movie"> value
        # or as a JS "var url = ..." assignment
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # the mrss feed lists one <item> per part of the episode
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "...:<show>.com:<mediaId>"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # per-part config XML listing one <rendition> per bitrate
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # the rtmp URLs are not directly downloadable; rewrite the
            # final path segment onto a known HTTP mirror
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2380
2381
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from an Escapist video page.

        The page's og:video meta tag points at a flash player whose
        percent-encoded ``config=`` parameter references a JS-style
        configuration file holding the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUG FIX: the field name passed here previously said u'player url'
        # (copy-paste from the extraction above); it is only used in error
        # messages, so name the field correctly.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # the second playlist entry holds the actual video
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
2441
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # partial result dict; title/description/url/ext are filled below
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # first, the metadata XML describing the video
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # then the f4m manifest the metadata points to
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # manifest elements live in the Adobe f4m XML namespace
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # build the URL of the first HDS fragment from the manifest's id
        # and the media node's url attribute
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2503
2504
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # the flv URL is percent-encoded inside the page source
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        # page title has the form "<name> - XVID..."
        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        # thumbnail is optional; extraction failure is not fatal
        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2545
2546
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce that the permalink is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a track permalink via the API and pick the mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # uploader name and track-title slug both come from the URL
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # resolve the permalink into the API's JSON track description
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))

        track_id = info['id']
        self.report_extraction(full_title)

        # fetch the per-track stream definitions and take the 128k mp3
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(self._download_webpage(
            streams_url, full_title,
            u'Downloading stream definitions',
            u'unable to download stream definitions'))

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2603
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # resolve the set permalink into the API's JSON description
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # report every API error, then give up on the whole set
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        # one result per track: look up each track's stream definitions
        # and take the 128k mp3 stream
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2666
2667
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the rtmpe stream URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # the real media id is base64-encoded in a JS variable
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(
            r'contentTitle = "(.*?)";', webpage, u'title')

        video_description = self._html_search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # the last path segment of the stream URL gives both id and
        # extension (extension is always(?) mp4, but seems to be flv)
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,
            'thumbnail': None,
            'description': video_description,
        }]
2710
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name -> (bitrate -> url list), or just
        format name -> url list when no bitrate info exists (that case
        is detected via the TypeError below).  'best' or an unknown
        bitrate selects the highest available one.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # a plain GET probe; any network/HTTP failure just moves
                # on to the next candidate
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print one 'format  bitrate  [ext]' line per available stream."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): the .decode('utf-8') calls in this method only work
        # on Python 2 str objects; under Python 3 they raise AttributeError.
        # Presumably part of why _WORKING is False -- confirm before
        # re-enabling this extractor.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe the formats until one yields a reachable url
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2815
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page;
    # the 'course' and 'video' groups drive the dispatch below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, a course playlist, or the whole site.

        Course and root pages recurse through self.extract on each
        referenced page and concatenate the results.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect every video page linked from the course page and
            # extract each one in turn.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Recurse into every course page listed on the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2911
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        Reads the page's mtv_* meta tags, then queries the mediaGen
        service for the available renditions and picks the last one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # BUG FIX: 'performer' was referenced in the result dict below but
        # never assigned, so extraction always died with a NameError.  The
        # mtv_an meta tag carries the artist/performer name: extract it,
        # use it as the uploader, and build the "<performer> - <song>"
        # title when the song name is known.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')
        video_title = performer + ' - ' + song_name if song_name else performer

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are required to build the mediaGen URL; fail with a
        # clear message instead of a TypeError on string concatenation.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any renditions')

        # For now, always pick the highest quality (last listed rendition).
        rendition = renditions[-1]

        try:
            # The MIME type is e.g. "video/mp4"; keep only the subtype.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2972
2973
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the charset with Youku's seeded PRNG.

        Reproduces the site's client-side scrambling: a linear
        congruential generator driven by `seed` repeatedly picks (and
        removes) one character from the source alphabet.  The exact
        statement order must be preserved or the mapping breaks.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index list into the real file id.

        Each numeric component of `fileId` indexes into the
        seed-shuffled alphabet from _get_file_ID_mix_string.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): any explicitly requested format other than
            # 'best'/'worst' silently falls back to 'flv' — confirm intended.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One decryption key per video segment.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 (0-based) of the decoded fileid carry the segment
        # number; they are replaced below with the index in upper-case hex.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3066
3067
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Pull the FLV URL, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Everything we need is embedded in the single page.
        webpage = self._download_webpage(url, video_id)

        # The flv_url parameter is percent-encoded in the page source.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))
        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')
        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3106
3107
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Downloads the post page for metadata, then the photo-viewer page
        that holds the actual video links, and picks the last (largest)
        entry of the sorted (resolution, url) tuples.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort the (resolution, url) tuples ascending.  NOTE(review): the
        # resolution group is a string, so this sort is lexicographic —
        # verify it matches numeric order for the values Google emits.
        links = sorted(mobj)

        # Take the last entry of the ascending sort, i.e. the highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3181
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN download URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The MP4 lives on Turner's CDN under the same path component.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # The upload date is not present in the HTML the site returns.
        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
3215
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Number of clips requested per API page when listing a channel.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total clips in the response, info dicts for the clips
        that actually carry a video URL).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API returns a JSON object instead of a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: channel archive, chapter, or single video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archives API below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a segment of a longer archived broadcast.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the archive this chapter belongs to; the for/else
            # raises when no <archive> element matches archive_id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter title/thumbnail/etc. come from the Twitch Kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Fetch pages until a short page signals the end; non-paged
        # (single video) requests stop after the first iteration.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3348
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the MP4 URL, title and description from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the document title.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
3377
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so it needs re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Collect every movie trailer on a game's video page as a playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Steam may interpose an age gate; resubmit with a fixed birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Walk the three match streams in lockstep: player JS entry,
        # display title, and thumbnail for each movie.
        triples = zip(re.finditer(urlRE, webpage),
                      re.finditer(namesRE, webpage),
                      re.finditer(thumbsRE, webpage))
        videos = []
        for vid, vtitle, thumb in triples:
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3432
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN URL from the video id and scrape page metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV can be fetched straight from the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
3464
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the player's file variable plus title/thumbnail."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # WSHH candy pages lack the image_src link and keep the real title
        # in a different element; a missing thumbnail is the signal.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3504
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract a show from rbmaradio.com via the embedded gon JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)
        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Ask the CDN for the 256 kbps stream explicitly.
        video_url = show['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot in the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = show.get('host', {})
        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': show['title'],
                'description': show.get('teaser_text'),
                'location': show.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': show.get('image', {}).get('large_url_2x'),
                'duration': show.get('duration'),
        }]
3538
3539
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Only json.loads can raise here, and malformed input raises
            # ValueError; the previous bare "except:" masked real bugs.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str() fix: concatenating the exception object itself raised
            # TypeError and hid which key was actually missing.
            raise ExtractorError(u'Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this used to test the undefined name "result", which
            # raised NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3644
3645
3646
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title comes straight from the URL, not the page.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player initialisation code.
        video_url = self._search_regex(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        upload_date = self._html_search_regex(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
3681
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The real media data lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3722
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API and collect every track of a mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is all that the play API requires.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        track_number = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number+1), track_count),
                errnote=u'Failed to download song information')
            track_number += 1
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API tells us when the mix is exhausted.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return tracks
3763
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Derive the CDN media URLs from the id and scrape title/uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        # Media files live at predictable CDN locations keyed by the id.
        return [{
                'id': video_id,
                'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
                'ext': 'mp4',
                'title': title,
                'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
                'uploader': uploader
        }]
3791
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    The URL decides the mode: /talks/... yields a single talk,
    /playlists/<id>/... yields a playlist of talk results.
    """
    # NOTE: the '#' characters inside this raw string are regex comments
    # (the pattern is compiled with re.VERBOSE), not Python comments.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be compiled with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Both patterns are iterated in document order and paired below;
        # they must therefore find the same talks in the same order.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk becomes a url_result that is re-extracted by TEDIE itself.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # talkDetails is a JSON object embedded in a script tag; it carries
        # the numeric id and the list of stream urls used below.
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # info is rebound from the raw JSON dict to the result dict; the last
        # entry of htmlStreams is the variant used for the download url.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
3866
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video through its XML metadata endpoint.

        Returns a one-element result list with url, title, format,
        thumbnail and description read from the metadata XML.
        Raises ExtractorError when the XML lacks a download url or title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name "ext",
            # which raised NameError; fall back to the file extension.
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the metadata XML.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3920
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video via its per-id XML descriptor."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the XML (original behaviour).
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3952
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape the media URL and og: metadata from a LiveLeak view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            page, u'video URL')

        # og:title carries a site prefix that we strip off.
        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title').replace('LiveLeak.com -', '').strip()

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False)

        uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            page, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader
        }]
3989
class ARDIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract an ARD Mediathek video, picking the highest quality stream."""
        # determine video id from url: a numeric documentId in the query
        # string takes precedence over the id from the path
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4028
class ZDFIE(InfoExtractor):
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Extract a ZDF Mediathek video by resolving its mms/rtsp stream URL.

        Raises ExtractorError when no stream, title or stream URL is found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # Bug fix: a list comprehension is never None, so the old
        # "streams is None" check could not trigger; test for emptiness.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # Bug fix: stream_ was previously unbound when neither loop matched,
        # raising NameError instead of the intended ExtractorError below.
        stream_ = None
        for s in streams:        #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_=s
                break
        for s in streams:        #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_=s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # The downloaded playlist contains either an mms:// or rtsp:// URL.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            # (typo fix in the error message: "extention" -> "extension")
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
4086
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a video embedded in a Tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video.group('video_url'),
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': video.group('ext')
                 }]
4120
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a freely downloadable Bandcamp track as mp3-320."""
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        # Renamed from "id" to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = track[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{'id': track_id,
                 'title': track[u'title'],
                 'ext': 'mp3',
                 'url': final_url,
                 'thumbnail': track[u'thumb_url'],
                 'uploader': track[u'artist']
                 }]
4166
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The page embeds a single mp4 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4194         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # The MRSS feed carries both the title and the mp4 url.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4221
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Re-fetch through the canonical URL so scheme/host variants all work.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
4255
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Normalize to the canonical https URL before downloading.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # The stream url sits in a twitter player meta tag.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4289
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/%s/%s' % (video_uploader_id, video_id)
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo page embeds a per-video secret required by both XML endpoints.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self' % (video_id, secret)
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        # The first document yields a node id used to request the playlist.
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s&rd=video.yahoo.com&noad=1' % (node_id, secret)
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist holds an app prefix and an HTML-escaped path; join them.
        m_stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = m_stream.group(1) + unescapeHTML(m_stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4338
class TeamcocoIE(InfoExtractor):
    # Information Extractor for teamcoco.com videos.
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available on the <article> wrapper, not in the URL.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"', webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"', webpage, u'description', fatal=False)

        # A separate XML document lists the media files per quality level.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
4377
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot in the "www." prefix is escaped so it matches only a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries an optional server host and the media file.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server: the file field already is a complete (urlencoded) URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is encoded as YYYY-MM-DD inside a tooltip hint attribute.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
4429
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Request the track page with an ajax-style query string; the cookie
        # from the response is required later to fetch the serving URL.
        query = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        track = track_list[u'tracks'][0]

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request: exchange (id, key) + cookie for the final mp3 url.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    "mp3",
            'title':  title,
            'artist': artist,
        }]
4479
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects through a javascript location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        # <title> looks like "title / site"; keep only the first part.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # Ask the flash info endpoint for the media and thumbnail urls.
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is "key1=url&key2=thumb"; take the value of each field.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
4515
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com"""
    # Dots escaped so the pattern only matches the literal host name.
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than clips/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            # Fixed: this path used to raise the undefined name "ExtractError"
            # (a NameError) with a typo'd message.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
4566
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in priority order and instantiate them all at once.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4635
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Look up "<Name>IE" in this module's namespace; raises KeyError if absent.
    return globals()['%sIE' % ie_name]