small tweaks
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance.
    _ready = False
    # The FileDownloader that receives the extracted info dictionaries.
    _downloader = None
    # Class-level flag; False marks a known-broken IE (see class docstring).
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the subclass hook at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None prints the default message; note=False suppresses output.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Forward the original traceback to ExtractorError.
            # NOTE(review): 'sys' is not imported at the top of this file and
            # presumably arrives via 'from .utils import *' -- confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the charset out of e.g. "text/html; charset=iso-8859-1".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # No charset declared in the header; assume UTF-8.
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 keeps arbitrary page bytes printable on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on pages whose declared charset is wrong.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608: they set the correct value of the
    # '_type' key in the info dictionary.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (first result only), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and dispatch to _get_n_results.

        Raises ExtractorError on a malformed query; requests above
        _MAX_RESULTS are clamped with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            # Defensive: the regex only admits positive integers here.
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex; also matches a naked 11-character video ID.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the English interface so scraped page strings are predictable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the redirect target of age-verification style URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Machine name looked up in ~/.netrc when --netrc is used.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, re-ranked so the free (WebM) formats come first within
    # comparable quality tiers.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; callers fall back to 'flv' for other itags.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions; values appear to be height x width -- TODO confirm.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    # Shadows the IE_NAME property from InfoExtractor with a constant.
    IE_NAME = u'youtube'
293
294     @classmethod
295     def suitable(cls, url):
296         """Receives a URL and returns True if suitable for this IE."""
297         if YoutubePlaylistIE.suitable(url): return False
298         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
299
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        # sub_lang_list maps language code -> track name; show the codes.
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
340
    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name.

        On failure, returns a tuple (error_message, None) instead of
        raising; callers distinguish the cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Each track element carries name="..." and lang_code="..." attributes;
        # findall yields (name, lang_code) pairs, keyed below by lang_code.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list
353
354     def _list_available_subtitles(self, video_id):
355         sub_lang_list = self._get_available_subtitles(video_id)
356         self.report_video_subtitles_available(video_id, sub_lang_list)
357
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Fetch one subtitle track from the timedtext API.

        Return tuple:
        (error_message, sub_lang, sub)
        error_message is None on success; sub_lang and sub are None on failure.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            # Empty response body -- treat as a failure.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
378
379     def _extract_subtitle(self, video_id):
380         """
381         Return a list with a tuple:
382         [(error_message, sub_lang, sub)]
383         """
384         sub_lang_list = self._get_available_subtitles(video_id)
385         sub_format = self._downloader.params.get('subtitlesformat')
386         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
387             return [(sub_lang_list[0], None, None)]
388         if self._downloader.params.get('subtitleslang', False):
389             sub_lang = self._downloader.params.get('subtitleslang')
390         elif 'en' in sub_lang_list:
391             sub_lang = 'en'
392         else:
393             sub_lang = list(sub_lang_list.keys())[0]
394         if not sub_lang in sub_lang_list:
395             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
396
397         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
398         return [subtitle]
399
400     def _extract_all_subtitles(self, video_id):
401         sub_lang_list = self._get_available_subtitles(video_id)
402         sub_format = self._downloader.params.get('subtitlesformat')
403         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
404             return [(sub_lang_list[0], None, None)]
405         subtitles = []
406         for sub_lang in sub_lang_list:
407             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
408             subtitles.append(subtitle)
409         return subtitles
410
411     def _print_formats(self, formats):
412         print('Available formats:')
413         for x in formats:
414             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
415
416     def _real_initialize(self):
417         if self._downloader is None:
418             return
419
420         username = None
421         password = None
422         downloader_params = self._downloader.params
423
424         # Attempt to use provided username and password or .netrc data
425         if downloader_params.get('username', None) is not None:
426             username = downloader_params['username']
427             password = downloader_params['password']
428         elif downloader_params.get('usenetrc', False):
429             try:
430                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
431                 if info is not None:
432                     username = info[0]
433                     password = info[2]
434                 else:
435                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
436             except (IOError, netrc.NetrcParseError) as err:
437                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
438                 return
439
440         # Set language
441         request = compat_urllib_request.Request(self._LANG_URL)
442         try:
443             self.report_lang()
444             compat_urllib_request.urlopen(request).read()
445         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
446             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
447             return
448
449         # No authentication to be performed
450         if username is None:
451             return
452
453         request = compat_urllib_request.Request(self._LOGIN_URL)
454         try:
455             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
456         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
458             return
459
460         galx = None
461         dsh = None
462         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
463         if match:
464           galx = match.group(1)
465
466         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
467         if match:
468           dsh = match.group(1)
469
470         # Log in
471         login_form_strs = {
472                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
473                 u'Email': username,
474                 u'GALX': galx,
475                 u'Passwd': password,
476                 u'PersistentCookie': u'yes',
477                 u'_utf8': u'霱',
478                 u'bgresponse': u'js_disabled',
479                 u'checkConnection': u'',
480                 u'checkedDomains': u'youtube',
481                 u'dnConn': u'',
482                 u'dsh': dsh,
483                 u'pstMsg': u'0',
484                 u'rmShown': u'1',
485                 u'secTok': u'',
486                 u'signIn': u'Sign in',
487                 u'timeStmp': u'',
488                 u'service': u'youtube',
489                 u'uilel': u'3',
490                 u'hl': u'en_US',
491         }
492         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
493         # chokes on unicode
494         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
495         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
496         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
497         try:
498             self.report_login()
499             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
500             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
501                 self._downloader.report_warning(u'unable to log in: bad username or password')
502                 return
503         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
504             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
505             return
506
507         # Confirm age
508         age_form = {
509                 'next_url':     '/',
510                 'action_confirm':   'Confirm',
511                 }
512         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
513         try:
514             self.report_age_confirmation()
515             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
516         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
517             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
518
519     def _extract_id(self, url):
520         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
521         if mobj is None:
522             raise ExtractorError(u'Invalid URL: %s' % url)
523         video_id = mobj.group(2)
524         return video_id
525
526     def _real_extract(self, url):
527         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
528         mobj = re.search(self._NEXT_URL_RE, url)
529         if mobj:
530             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
531         video_id = self._extract_id(url)
532
533         # Get video webpage
534         self.report_video_webpage_download(video_id)
535         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
536         request = compat_urllib_request.Request(url)
537         try:
538             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
539         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
540             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
541
542         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
543
544         # Attempt to extract SWF player URL
545         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
546         if mobj is not None:
547             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
548         else:
549             player_url = None
550
551         # Get video info
552         self.report_video_info_webpage_download(video_id)
553         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
554             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
555                     % (video_id, el_type))
556             video_info_webpage = self._download_webpage(video_info_url, video_id,
557                                     note=False,
558                                     errnote='unable to download video info webpage')
559             video_info = compat_parse_qs(video_info_webpage)
560             if 'token' in video_info:
561                 break
562         if 'token' not in video_info:
563             if 'reason' in video_info:
564                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
565             else:
566                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
567
568         # Check for "rental" videos
569         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
570             raise ExtractorError(u'"rental" videos not supported')
571
572         # Start extracting information
573         self.report_information_extraction(video_id)
574
575         # uploader
576         if 'author' not in video_info:
577             raise ExtractorError(u'Unable to extract uploader name')
578         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
579
580         # uploader_id
581         video_uploader_id = None
582         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
583         if mobj is not None:
584             video_uploader_id = mobj.group(1)
585         else:
586             self._downloader.report_warning(u'unable to extract uploader nickname')
587
588         # title
589         if 'title' not in video_info:
590             raise ExtractorError(u'Unable to extract video title')
591         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
592
593         # thumbnail image
594         if 'thumbnail_url' not in video_info:
595             self._downloader.report_warning(u'unable to extract video thumbnail')
596             video_thumbnail = ''
597         else:   # don't panic if we can't find it
598             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
599
600         # upload date
601         upload_date = None
602         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
603         if mobj is not None:
604             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
605             upload_date = unified_strdate(upload_date)
606
607         # description
608         video_description = get_element_by_id("eow-description", video_webpage)
609         if video_description:
610             video_description = clean_html(video_description)
611         else:
612             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
613             if fd_mobj:
614                 video_description = unescapeHTML(fd_mobj.group(1))
615             else:
616                 video_description = u''
617
618         # subtitles
619         video_subtitles = None
620
621         if self._downloader.params.get('writesubtitles', False):
622             video_subtitles = self._extract_subtitle(video_id)
623             if video_subtitles:
624                 (sub_error, sub_lang, sub) = video_subtitles[0]
625                 if sub_error:
626                     self._downloader.report_error(sub_error)
627
628         if self._downloader.params.get('allsubtitles', False):
629             video_subtitles = self._extract_all_subtitles(video_id)
630             for video_subtitle in video_subtitles:
631                 (sub_error, sub_lang, sub) = video_subtitle
632                 if sub_error:
633                     self._downloader.report_error(sub_error)
634
635         if self._downloader.params.get('listsubtitles', False):
636             sub_lang_list = self._list_available_subtitles(video_id)
637             return
638
639         if 'length_seconds' not in video_info:
640             self._downloader.report_warning(u'unable to extract video duration')
641             video_duration = ''
642         else:
643             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
644
645         # token
646         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
647
648         # Decide which formats to download
649         req_format = self._downloader.params.get('format', None)
650
651         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
652             self.report_rtmp_download()
653             video_url_list = [(None, video_info['conn'][0])]
654         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
655             url_map = {}
656             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
657                 url_data = compat_parse_qs(url_data_str)
658                 if 'itag' in url_data and 'url' in url_data:
659                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
660                     if not 'ratebypass' in url: url += '&ratebypass=yes'
661                     url_map[url_data['itag'][0]] = url
662
663             format_limit = self._downloader.params.get('format_limit', None)
664             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
665             if format_limit is not None and format_limit in available_formats:
666                 format_list = available_formats[available_formats.index(format_limit):]
667             else:
668                 format_list = available_formats
669             existing_formats = [x for x in format_list if x in url_map]
670             if len(existing_formats) == 0:
671                 raise ExtractorError(u'no known formats available for video')
672             if self._downloader.params.get('listformats', None):
673                 self._print_formats(existing_formats)
674                 return
675             if req_format is None or req_format == 'best':
676                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
677             elif req_format == 'worst':
678                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
679             elif req_format in ('-1', 'all'):
680                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
681             else:
682                 # Specific formats. We pick the first in a slash-delimeted sequence.
683                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
684                 req_formats = req_format.split('/')
685                 video_url_list = None
686                 for rf in req_formats:
687                     if rf in url_map:
688                         video_url_list = [(rf, url_map[rf])]
689                         break
690                 if video_url_list is None:
691                     raise ExtractorError(u'requested format not available')
692         else:
693             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
694
695         results = []
696         for format_param, video_real_url in video_url_list:
697             # Extension
698             video_extension = self._video_extensions.get(format_param, 'flv')
699
700             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
701                                               self._video_dimensions.get(format_param, '???'))
702
703             results.append({
704                 'id':       video_id,
705                 'url':      video_real_url,
706                 'uploader': video_uploader,
707                 'uploader_id': video_uploader_id,
708                 'upload_date':  upload_date,
709                 'title':    video_title,
710                 'ext':      video_extension,
711                 'format':   video_format,
712                 'thumbnail':    video_thumbnail,
713                 'description':  video_description,
714                 'player_url':   player_url,
715                 'subtitles':    video_subtitles,
716                 'duration':     video_duration
717             })
718         return results
719
720
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST the age
        confirmation so that filtered videos become accessible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by submitting the filter form
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        YouTube-hosted videos (ids of the form ``yt-XXXX``) are delegated
        to the YouTube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Direct mediaURL present in the page
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended as a query param) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to parsing the flashvars blob for mediaData JSON
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUG FIX: the webpage is already a text string, so the previous
        # .decode('utf-8') calls crashed on Python 3 (str has no decode)
        # and were redundant on Python 2; all of them are removed here.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
816
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract metadata and the best available stream for a Dailymotion video."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the path component before any '_' title suffix or query string.
        video_id = m.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Request the page with the family filter disabled so that
        # age-restricted videos are still reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the highest-quality stream key present in the flashvars blob.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the official
        # ("author") markup; warn rather than fail when neither matches.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY on the page; normalize to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
891
892
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract a Photobucket video, preferring the embedded JSON media
        descriptor and falling back to legacy page scraping."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUG FIX: these values are already text strings (the webpage and the
        # URL match groups), so the old .decode('utf-8') calls failed on
        # Python 3 and were unnecessary on Python 2; they have been removed.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
954
955
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract a Yahoo Screen video.

        Newer pages embed a YUI media CONTENT_ID which allows a direct JSON
        API query; older pages are scraped via two legacy mrss endpoints.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: no CONTENT_ID on the page.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed gives MM/DD/YYYY; normalize to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUG FIX: test the match for None *before* touching its groups.
            # The original dereferenced m_rest first, so a failed match
            # raised AttributeError instead of this ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper to get JSON.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
1023
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo video via the page-embedded player config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; only catch what this block can
        # actually raise (IndexError from the split indexing, ValueError
        # from json.loads on malformed JSON).
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (page gives YYYY-MM-DD; stored as YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1125
1126
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<digits>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, wrapping network and
        URL errors in ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples*, a list of (group_index, key, error_message) triples.

        Raises ExtractorError if the regex does not match at all, or with
        the triple's error message if a required group is empty.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream javascript to locate the rtmp url.

        NOTE(review): the computed video_url is discarded and this method
        returns None, so live streams yield no download — looks unfinished;
        confirm intended behavior before relying on the live path.
        """
        # Language code is the 4th-from-last path component on live URLs
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # The JS file contains the stream path, the swf player and the rtmp base
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through two levels of indirection to the
        final <video> XML and return an info dict for the hd stream."""
        # Language code is the 3rd-from-last path component on +7 URLs
        video_lang = url.split('/')[-3]
        # Step 1: the page's flash params point at a videoref file URL
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref file lists one <video> ref per language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-language XML carries id, title, date and the hd url
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # Dispatch on URL shape: live streams vs Arte+7 catch-up pages.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns None, so this branch
            # produces no result (see note on that method).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1246
1247
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we are guessing, except in test mode where it is expected.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and follows
        redirects; returns the final URL, or False if it equals *url*.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that performs HEAD instead of GET
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Re-issue the redirect target as a HEAD request, dropping
                    # body-related headers which no longer apply.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the 405 response before retrying with GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        # open() returns None when no handler supports the URL scheme
        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Resolve URL-shortener style redirects first and delegate.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # id = media file basename without extension; ext = its extension
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1381
1382
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the gdata API (50 results per page) and returns a
        playlist result of Youtube url entries.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the true total; never request more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may overshoot n; trim unconditionally (a no-op when
        # len(video_ids) <= n).
        video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
1425
1426
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        entries = []
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # One url entry per organic result link on the page.
            entries.extend({'_type': 'url', 'url': hit.group(1)}
                           for hit in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results were requested or the "next" link is gone.
            no_more_pages = re.search(self._MORE_PAGES_INDICATOR, webpage) is None
            if (pagenum * 10 > n) or no_more_pages:
                return {
                    '_type': 'playlist',
                    'id': query,
                    'entries': entries,
                }
1457
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the JSON search endpoint (30 results per page) and
        returns a playlist dict of Yahoo url entries.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # i must survive the loop; original code raised NameError when a
            # page came back with an empty result list.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result entry without a recognizable video URL; skip it
                    # instead of crashing on mobj.group().
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough results were gathered or the API says this was
            # the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1491
1492
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id sits in one of two alternation groups of _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, watch-url) pairs page by page from the gdata API.
        entries = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            feed_entries = response['feed']['entry']
            entries.extend((entry['yt$position']['$t'], entry['content']['src'])
                           for entry in feed_entries
                           if 'content' in entry)

            # A short page means there is nothing left to fetch.
            if len(feed_entries) < self._MAX_RESULTS:
                break

        # Order by playlist position, keep only the urls.
        ordered_urls = [pair[1] for pair in sorted(entries)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1558
1559
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first-seen order, deduped."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        first_page = self._download_webpage(self._TEMPLATE_URL % (channel_id, pagenum),
                                            channel_id,
                                            u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(first_page)

        # Subsequent pages come from the JSON-based channel_ajax endpoint,
        # fetched until the "load more" widget disappears.
        if self._MORE_PAGES_INDICATOR in first_page:
            while True:
                pagenum += 1
                ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
                raw = self._download_webpage(ajax_url, channel_id,
                                             u'Downloading page #%s' % pagenum)
                payload = json.loads(raw)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))
                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1617
1618
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The gdata API caps each response at _GDATA_PAGE_SIZE ids, so keep
        # requesting consecutive pages until a short (non-full) page shows
        # up, which means everything has been fetched.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect ids from this page, first-seen order, deduped per page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in ids_in_page:
                    ids_in_page.append(vid)
            video_ids.extend(ids_in_page)

            # A short page is the last page; stop querying.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                       for vid in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1675
1676
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Fail with a clear message instead of an AttributeError.
            raise ExtractorError(u'Unable to extract blip.tv user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                # Dedup against the unescaped value; the original compared
                # the raw match against already-unescaped entries, so ids
                # containing HTML entities were never deduplicated.
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1735
1736
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is a regex comment; the optional '../' segment matches a
    # two-character locale path component.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Fetch the file page with the 'Free download' form submitted and
        extract the direct file URL and title."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # NOTE(review): webpage stays raw bytes (no .decode here); the
            # regex searches and .decode calls below assume Python 2 str —
            # confirm behavior under Python 3.
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction notice, collapsed to one line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1784
1785
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in before extraction.

        Credentials come from --username/--password or a .netrc entry for
        machine 'facebook'. If neither is available, extraction proceeds
        unauthenticated; login failures only emit warnings.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials at all: skip login silently.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login <form>, the attempt failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video info dict from a Facebook video/photo URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical video page for this id.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The page embeds the player configuration as JSON between these two
        # JavaScript fragments; capture what lies in between.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the actual video sources.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1882
1883
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info, either from the site's JSON metadata or,
        when the server answers with the media itself, directly."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a player page whose URL fragment carries
        # the real file id; resolve it and restart extraction with the
        # canonical http://blip.tv/a/a-<id> URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Request JSON metadata for the video; the iTunes User-Agent makes
        # blip.tv serve the full metadata document.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The server sent the media itself instead of JSON: build the
                # result from the URL alone and pass the open handle to the
                # downloader via 'urlhandle'.
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # urlh is guaranteed bound here: reaching this point without an
            # exception means the urlopen above succeeded and the direct
            # download branch was not taken.
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # 'datestamp' is in '%m-%d-%y %H:%M%p' form; normalize to YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                # Missing keys or a bad datestamp both end up here.
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1977
1978
1979 class MyVideoIE(InfoExtractor):
1980     """Information Extractor for myvideo.de."""
1981
1982     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1983     IE_NAME = u'myvideo'
1984
1985     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1986     # Released into the Public Domain by Tristan Fischer on 2013-05-19
1987     # https://github.com/rg3/youtube-dl/pull/842
1988     def __rc4crypt(self,data, key):
1989         x = 0
1990         box = list(range(256))
1991         for i in list(range(256)):
1992             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
1993             box[i], box[x] = box[x], box[i]
1994         x = 0
1995         y = 0
1996         out = ''
1997         for char in data:
1998             x = (x + 1) % 256
1999             y = (y + box[x]) % 256
2000             box[x], box[y] = box[y], box[x]
2001             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2002         return out
2003
2004     def __md5(self,s):
2005         return hashlib.md5(s).hexdigest().encode()
2006
2007     def _real_extract(self,url):
2008         mobj = re.match(self._VALID_URL, url)
2009         if mobj is None:
2010             raise ExtractorError(u'invalid URL: %s' % url)
2011
2012         video_id = mobj.group(1)
2013
2014         GK = (
2015           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2016           b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2017           b'TnpsbA0KTVRkbU1tSTRNdz09'
2018         )
2019
2020         # Get video webpage
2021         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2022         webpage = self._download_webpage(webpage_url, video_id)
2023
2024         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2025         if mobj is not None:
2026             self.report_extraction(video_id)
2027             video_url = mobj.group(1) + '.flv'
2028
2029             mobj = re.search('<title>([^<]+)</title>', webpage)
2030             if mobj is None:
2031                 raise ExtractorError(u'Unable to extract title')
2032             video_title = mobj.group(1)
2033
2034             mobj = re.search('[.](.+?)$', video_url)
2035             if mobj is None:
2036                 raise ExtractorError(u'Unable to extract extention')
2037             video_ext = mobj.group(1)
2038
2039             return [{
2040                 'id':       video_id,
2041                 'url':      video_url,
2042                 'uploader': None,
2043                 'upload_date':  None,
2044                 'title':    video_title,
2045                 'ext':      u'flv',
2046             }]
2047
2048         # try encxml
2049         mobj = re.search('var flashvars={(.+?)}', webpage)
2050         if mobj is None:
2051             raise ExtractorError(u'Unable to extract video')
2052
2053         params = {}
2054         encxml = ''
2055         sec = mobj.group(1)
2056         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2057             if not a == '_encxml':
2058                 params[a] = b
2059             else:
2060                 encxml = compat_urllib_parse.unquote(b)
2061         if not params.get('domain'):
2062             params['domain'] = 'www.myvideo.de'
2063         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2064         if 'flash_playertype=MTV' in xmldata_url:
2065             self._downloader.report_warning(u'avoiding MTV player')
2066             xmldata_url = (
2067                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2068                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
2069             ) % video_id
2070
2071         # get enc data
2072         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2073         enc_data_b = binascii.unhexlify(enc_data)
2074         sk = self.__md5(
2075             base64.b64decode(base64.b64decode(GK)) +
2076             self.__md5(
2077                 str(video_id).encode('utf-8')
2078             )
2079         )
2080         dec_data = self.__rc4crypt(enc_data_b, sk)
2081
2082         # extracting infos
2083         self.report_extraction(video_id)
2084
2085         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2086         if mobj is None:
2087             raise ExtractorError(u'unable to extract rtmpurl')
2088         video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
2089         if 'myvideo2flash' in video_rtmpurl:
2090             self._downloader.report_warning(u'forcing RTMPT ...')
2091             video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
2092
2093         # extract non rtmp videos
2094         if (video_rtmpurl is None) or (video_rtmpurl == ''):
2095             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2096             if mobj is None:
2097                 raise ExtractorError(u'unable to extract url')
2098             video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2099
2100         mobj = re.search('source=\'(.*?)\'', dec_data)
2101         if mobj is None:
2102             raise ExtractorError(u'unable to extract swfobj')
2103         video_file     = compat_urllib_parse.unquote(mobj.group(1))
2104
2105         if not video_file.endswith('f4m'):
2106             ppath, prefix = video_file.split('.')
2107             video_playpath = '%s:%s' % (prefix, ppath)
2108             video_hls_playlist = ''
2109         else:
2110             video_playpath = ''
2111             video_hls_playlist = (
2112                 video_filepath + video_file
2113             ).replace('.f4m', '.m3u8')
2114
2115         mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
2116         if mobj is None:
2117             raise ExtractorError(u'unable to extract swfobj')
2118         video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
2119
2120         mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
2121         if mobj is None:
2122             raise ExtractorError(u'unable to extract title')
2123         video_title = mobj.group(1)
2124
2125         return [{
2126             'id':                 video_id,
2127             'url':                video_rtmpurl,
2128             'tc_url':             video_rtmpurl,
2129             'uploader':           None,
2130             'upload_date':        None,
2131             'title':              video_title,
2132             'ext':                u'flv',
2133             'play_path':          video_playpath,
2134             'video_file':         video_file,
2135             'video_hls_playlist': video_hls_playlist,
2136             'player_url':         video_swfobj,
2137         }]
2138
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepts the short forms :tds/:thedailyshow and :cr/:colbert (which
    redirect to the newest full episode), full-episode URLs, and clip URLs.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE: written with re.VERBOSE, so literal whitespace below is ignored.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest; turls entries use these keys.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used for --list-formats display).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Frame size per bitrate (used for --list-formats display).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE.

        Overridden because _VALID_URL needs the re.VERBOSE flag, which the
        base-class matcher does not apply.
        """
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return a list of info dicts, one per part of the episode/clip.

        Raises ExtractorError when the URL cannot be matched or the Flash
        URL / RTMP URL cannot be located in the downloaded pages.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortnames map to the show's full-episodes index page; the index
        # redirects to the newest episode (handled below via dlNewest).
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means the bare index page was given:
            # follow the redirect to the newest episode.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Re-parse the URL we were redirected to; it must now name a
            # specific episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id (data-mgid) without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:...:<show>.com:<mediaId>; split out the
            # short media id and the show name.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp_url) pairs for every available rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL to the equivalent progressive-HTTP URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2305
2306
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the mp4 URL and metadata for an Escapist video page.

        The page's OpenGraph tags provide description, thumbnail and the
        Flash player URL; the player URL carries a config= parameter
        pointing at a JavaScript-flavoured JSON playlist.

        Raises ExtractorError if the URL does not match, any of the
        required <meta> tags is missing, or the configuration is not
        parseable.  (Previously a missing tag raised an opaque
        AttributeError from NoneType.group.)
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Guard every regex: the page layout changes and an unmatched
        # search must surface as ExtractorError, not AttributeError.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON: single quotes must be
        # normalised before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # playlist[0] is an advertisement; the actual video is entry 1.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2360
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Resolves a video page to an Adobe HDS (f4f) fragment URL via the
    moogaloop metadata XML and the f4m manifest it points to.
    """

    # Marked not working; kept for reference until the site flow is fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video.

        Raises ExtractorError if the URL does not match, a download fails,
        or either XML document lacks the expected elements.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # First request: metadata XML with title/description/manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # Second request: the f4m manifest (hdcore marks an HDS client).
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the first-fragment URL from the manifest location.
        # NOTE(review): video_id[:-2] assumes a fixed two-char suffix on the
        # manifest id — confirm against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2422
2423
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page.

        Returns a one-element list with the info dict; raises
        ExtractorError if the URL or any required page fragment cannot
        be matched.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL is URL-encoded inside the player flashvars.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            raise ExtractorError(u'Unable to extract video url')
        flv_url = compat_urllib_parse.unquote(flv_match.group(1))

        # The page <title> ends with a "- XVID..." suffix we strip off.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        page_title = title_match.group(1)

        # The entire matched URL (group 0) is the thumbnail image.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        thumbnail_url = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': page_title,
            'ext': 'flv',
            'thumbnail': thumbnail_url,
            'description': None,
        }]
2473
2474
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a track page to its 128k mp3 stream via the public API.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The uploader name and the track slug are both parts of the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical page to a track-info JSON document.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))

        video_id = info['id']
        self.report_extraction(full_title)

        # A second API call lists the stream URLs for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2531
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set page and return one info dict per track.

        Returns None (after reporting errors) when the resolve API
        responds with an error list; raises ExtractorError for an
        unmatched URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title  # NOTE(review): currently unused
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set page to a JSON document listing its tracks.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # API-level failure: report each message and bail out.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Per-track API call for the actual stream URLs.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2594
2595
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP URL, title and description of a presentation.

        The page embeds the real media id base64-encoded in the
        jsclassref variable; decoding it yields the path under the RTMP
        server.  Raises ExtractorError if the URL, media id or title
        cannot be extracted.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded, then URL-quoted, media path)
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description (optional; a placeholder is used otherwise)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: media filenames may contain dots
        # themselves, which made the old two-way split() raise ValueError.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2642
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict mapping bitrate -> url list, or a
        plain url list when no bitrate info exists (the TypeError path).
        """
        file_url = None  # NOTE(review): unused; url_list is what is returned
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
 
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open (any 2xx/3xx) means the mirror works.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None  # this mirror failed; try the next one

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON and pick a working stream URL.

        NOTE(review): the .decode('utf-8') calls below assume Python 2
        byte strings; on Python 3 they raise AttributeError on str —
        likely part of why this IE is marked _WORKING = False.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until a live mirror is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2747
2748 class StanfordOpenClassroomIE(InfoExtractor):
2749     """Information extractor for Stanford's Open ClassRoom"""
2750
2751     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2752     IE_NAME = u'stanfordoc'
2753
    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page, or the
        site root; the latter two recurse through self.extract on each
        referenced page and return the concatenated results."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Both <title> and <videoFile> are required; findall()[0]
                # raises IndexError when either is absent.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when no heading is found.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # orderedSet drops duplicate links while keeping page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            # Re-dispatch every referenced video page through this extractor.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Each course page link becomes a reference that is extracted
            # recursively (course pages in turn reference video pages).
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2848
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a music video from an mtv.com page.

        Reads song name, performer and playlist ids from the page's
        <meta> tags, then downloads the mediaGen XML and picks the
        last (highest quality) rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text, so the old
        # .decode('iso-8859-1') calls were removed: str has no .decode on
        # Python 3, and on Python 2 they implicitly ascii-encoded first,
        # breaking on non-ASCII content.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message fixed: previously read u'Unable to mtvn_uri'
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # AttributeError covers a missing <src> child (find() -> None).
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2917
2918
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in multiple segments whose file ids are scrambled;
    they are decoded with a seed supplied by the getPlayList API.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp
        concatenated with two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the source alphabet shuffled by a linear-congruential
        generator seeded with the server-provided seed.

        The shuffled list is the lookup table used by _get_file_id;
        the exact constants and iteration order must match Youku's
        player implementation.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a scrambled file id: each '*'-separated number in
        fileId is an index into the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Return one info dict per video segment of the chosen format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): any requested format other than 'best'/'worst'
            # silently falls back to flv — confirm this is intended.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3011
3012
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape the flv url, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page once; every field below is scraped from it.
        webpage = self._download_webpage(url, video_id)

        def scrape(pattern, what):
            # First capture group of pattern, or a uniform error.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(u'Unable to extract %s' % what)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(scrape(self.VIDEO_URL_RE, 'video url'))
        video_title = scrape(self.VIDEO_TITLE_RE, 'video title')
        video_thumbnail = scrape(self.VIDEO_THUMB_RE, 'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3056
3057
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Scrapes date/uploader/title from the post page, follows the
        photo-viewer page, and picks the highest-resolution link.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort by the numeric resolution value. A plain sorted(mobj)
        # compared the strings lexicographically, which ranks e.g.
        # '720' above '1080' and could pick a lower-resolution link.
        links = sorted(mobj, key=lambda pair: int(pair[0]))

        # Choose the last of the sort, i.e. the highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3167
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 url from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The 720p mp4 lives at a predictable Turner CDN path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (unescaped) capture group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key: was misspelled 'uploader_date'; the downloader
            # only recognizes the standard 'upload_date' field.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3202
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size of the archive API; also used to detect the last page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total items in the response, info dicts for clips that
        actually have a video_file_url); the raw count lets the caller
        detect a short (final) page.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API reports errors as a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; keep only the digits.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Handle the three URL shapes: a channel archive (paged), a
        chapter of a broadcast, or a single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else branch runs only if no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the archive API until a short page is returned.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A page shorter than the limit means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3335
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the mp4 source url, title and description from the page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        source_m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_m:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_m.group('url'))

        # Prefer the player headline; fall back to the document title.
        title_m = (re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
                   or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if not title_m:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_m.group('title'))

        # Description is optional.
        desc_m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_m.group('desc')) if desc_m else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3373
class SteamIE(InfoExtractor):
    """Information extractor for trailers on Steam store pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so it must be matched
        # with re.VERBOSE here rather than via the generic machinery.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Collect every trailer on a game's store page as one playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request the age-gated trailer page directly, presenting a
        # 1970-01-01 birthday to pass the age check.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        # Movie entries, titles and thumbnails appear in the same order
        # on the page, so the three match streams are walked in lockstep.
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_title = name.group('videoName')
            video_url = movie.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(video_title),
                'thumbnail': video_thumb,
            })
        return [self.playlist_result(videos, gameID, game_title)]
3418
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Scrape title, uploader and thumbnail for a recorded video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv itself lives at a predictable CDN path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # Any failed search makes .group()/.strip() raise
            # AttributeError on None, collapsed into one error below.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                   webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_m.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3448
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Read the video url from the player flashvars and scrape metadata."""
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        src_m = re.search(_src_url, webpage_src)
        if src_m is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_m.group(1)
        # Guess the container from the url itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_m = re.search(r"<title>(.*)</title>", webpage_src)
        if title_m is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_m.group(1)

        thumb_m = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if thumb_m is not None:
            thumbnail = thumb_m.group(1)
        else:
            thumbnail = None
            candy_m = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_m is not None:
                title = candy_m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3497
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read show metadata from the embedded gon.show JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        meta_m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream and infer the extension from the path.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3532
3533
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    Scrapes title/date/uploader plus the list of downloadable formats
    from the watch page, then returns the format(s) the user asked for.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a simple cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional; only a warning when missing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional; only a warning when missing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # 4th path component encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUGFIX: previously tested the stale 'result' variable (always
            # non-None at this point), so an unavailable requested format
            # silently returned [None] instead of raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3648
3649
3650
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # One page fetch serves both the stream URL and the upload date
        webpage = self._download_webpage(url, video_id)

        # The FLV stream URL sits inside the player configuration
        m = re.search(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(m.group('url'))

        # Upload date comes from the "Added ... by" byline
        m = re.search(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        upload_date = unified_strdate(m.group('date'))

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
3689
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The watch page only yields the title and a pointer to the embed page
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if m is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = m.group('title').strip()

        m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = m.group(0).strip()
        video_id = m.group('videoid')

        # The real stream URL is set up in the embed page's player script
        webpage = self._download_webpage(embed_page_url, video_id)
        m = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if m is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = m.group('source')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3734
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API and collect every track of a mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment to PAGE.mix
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API wants a client-chosen session token
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # Each response determines the URL that yields the next track
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3778
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Derive CDN URLs from the video id; scrape title and uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Stream and thumbnail URLs are fully determined by the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }]
3802
class TEDIE(InfoExtractor):
    """Information extractor for TED talks and playlists (ted.com)."""
    # NOTE: this pattern must always be compiled with re.VERBOSE (see
    # suitable()); the inline '#' comments are part of the pattern text.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): '[.\s]' matches only literal dots and whitespace,
        # not "any character" — presumably '[\S\s]' was intended; confirm
        # against a live playlist page before changing.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Pair each talk <li> with its title link (matched in document
        # order) and delegate each talk URL back to this extractor.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media
        # slug needed to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3881
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata service."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: previously read the undefined name 'ext' (NameError);
            # fall back to the file extension as the format label.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata document
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3935
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video using the page title and the flash XML feed."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Per-video XML document describing the available encodings
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last listed variant of the document
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3968
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract stream URL, title, description and uploader from a view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        # Stream URL from the player configuration
        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        # Title from OpenGraph metadata, minus the site prefix
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are best-effort
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4015
class ARDIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Pick the highest-quality default-type stream from an ARD Mediathek page."""
        # Prefer a numeric documentId query parameter when present,
        # otherwise fall back to the id captured from the URL path.
        url_match = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else url_match.group('video_id')

        # Scrape title and every registered media stream from the webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all: this page carries an "fsk" age marker
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4054
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a video embedded in a Tumblr post."""
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # Video URL and container type sit inside a JS-escaped embed snippet
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = unescapeHTML(re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL).group('title'))

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb,
            'ext': ext,
        }]
4088
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')

        download_link = m_download.group(1)
        # Track id from the TralbumData JS object (renamed from 'id' to
        # avoid shadowing the builtin)
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4134
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # MP4 source URL from the HTML5 <source> tag
        m = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = m.group(1)

        m = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4165         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata comes from the player's MRSS feed, not the HTML page
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        m = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = m.group(1)

        m = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4194
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild the canonical URL from the id before downloading
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        m = re.search(r'\'file\': "(http://mobile-media\.howcast\.com/\d+\.mp4)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = m.group(1)

        # The og:title meta tag may be quoted with either " or '
        m = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = m.group(1) or m.group(2)

        # Description is optional
        m = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if m is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = m.group(1) or m.group(2)

        m = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
4238
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https URL built from the id
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        m = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = m.group(1)

        # Strip any query string off the thumbnail URL via the second group
        m = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = m.group(1)

        m = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = m.group(1)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
4281
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract the rtmp-style video URL plus metadata for a Flickr video.

        Flickr requires two intermediate XML documents: the first yields a
        node_id for the photo's secret, the second the actual stream paths.
        """
        mobj = re.match(self._VALID_URL, url)
        # Guard against a non-matching URL so callers get an ExtractorError
        # instead of an opaque AttributeError (consistent with TeamcocoIE).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by the video XML endpoints below.
        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = mobj.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = mobj.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final URL = APP attribute + HTML-unescaped FULLPATH attribute.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # The meta tags may quote the content attribute with either " or '.
        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            # Description is optional: warn but keep going.
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = mobj.group(1) or mobj.group(2)

        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1) or mobj.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4343
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract video URL, title, thumbnail and description for a Team Coco video.

        The page only carries a numeric video id; the actual media URL comes
        from a separate per-video XML document.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        # Guard the lookup: without it a layout change crashes with an
        # AttributeError instead of a clear extraction error.
        if mobj is None:
            raise ExtractorError(u'Unable to extract video id')
        video_id = mobj.group(1)

        self.report_extraction(video_id)

        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = mobj.group(1)

        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract description')
        description = mobj.group(1)

        # Fetch the CVP XML that lists the media files; pick the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
4389
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching-priority order; instantiate them all below.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4453
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live in
    # this module's namespace, so a plain globals() lookup suffices.
    return globals()['%sIE' % ie_name]