2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this listing is a sampled/garbled excerpt -- indentation and
# many lines are missing, including the closing quotes of this class
# docstring. Code left byte-identical; review notes only.
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Create the extractor and (optionally) wire up a downloader.

    The downloader can also be attached later via set_downloader().
    """
    self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when *url* matches this IE's _VALID_URL pattern."""
    match = re.match(cls._VALID_URL, url)
    return match is not None
# NOTE(review): the 'def working(self):' and 'def initialize(self):' headers
# that belong to the next two docstrings were elided from this listing --
# TODO confirm against upstream.
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# extract() is the public entry point; subclasses override _real_extract().
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): presumably a self.initialize() call preceded this return in
# the original -- dropped by the sampling. TODO confirm.
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (the FileDownloader instance) to this IE."""
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# NOTE(review): the next line belongs to the elided IE_NAME property (it
# strips the trailing 'IE' from the class name), not to _real_extract --
# the '@property def IE_NAME' header was dropped by the sampling.
112 return type(self).__name__[:-2]
# Opens url_or_request and returns the live HTTP response handle.
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
# NOTE(review): an 'if note is None:' guard and a 'try:' around urlopen
# appear to have been elided from this listing -- TODO confirm upstream.
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Any network failure is normalized into ExtractorError, keeping the
# original traceback via sys.exc_info()[2].
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Downloads a page and returns (decoded content, response handle).
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Charset is sniffed from the Content-Type header when present.
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
# NOTE(review): the 'if m:' / fallback-encoding branch and the 'try:'
# around get_full_url() were elided from this listing -- TODO confirm.
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
# Optional debugging aid: dump the raw page as base64 to the screen.
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps extraction going even on mis-declared charsets.
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Download a page and return its decoded content as a string."""
    content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
    return content
def to_screen(self, msg):
    """Print *msg* to screen, prefixed with '[ie_name]'."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Log that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Log that the webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Log that an age-confirmation attempt is being made."""
    self.to_screen(u'Confirming age')
# Helpers that tag result dictionaries with the proper '_type' value.
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
# NOTE(review): the 'return video_info' line appears to have been elided.
172 video_info['_type'] = 'video'
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
# NOTE(review): the dict continuation ('url'/'ie' entries) and the return
# were elided from this listing -- TODO confirm against upstream.
177 video_info = {'_type': 'url',
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
# NOTE(review): the 'entries' key, the two 'if ... is not None:' guards and
# the return were elided from this listing.
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
# NOTE(review): the "_VALID_URL = r'''^(" opener (and its closing quotes)
# for this verbose regex were elided from this listing -- TODO confirm.
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# NOTE(review): most entries of the two itag-lookup dicts below (and their
# closing braces) were elided from this listing.
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
234 _video_dimensions = {
def suitable(cls, url):
    """Return True for YouTube video URLs (never for playlist URLs)."""
    # Playlist URLs are claimed by YoutubePlaylistIE, not by this IE.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def report_lang(self):
    """Log that the interface language is about to be set."""
    self.to_screen(u'Setting language')
def report_login(self):
    """Log that a login attempt is starting."""
    self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
    """Log that the video webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the video-info webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
# Fixed docstrings below: both were copy-pasted from the info-webpage
# helper and did not describe what these methods actually print.
274 def report_video_subtitles_download(self, video_id):
275 """Report that available subtitles are being checked."""
276 self.to_screen(u'%s: Checking available subtitles' % video_id)
278 def report_video_subtitles_request(self, video_id, sub_lang, format):
279 """Report attempt to download subtitles for one language/format."""
280 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """List the subtitle languages available for *video_id*."""
    # Iterating the dict directly yields the same keys as list(.keys()).
    joined = ",".join(sub_lang_list)
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, joined))
def report_information_extraction(self, video_id):
    """Log that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Log that the requested *format* is not available for *video_id*.

    (The original docstring claimed this reported an extracted URL;
    the message below shows it reports an unavailable format.)
    """
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Log that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
# Queries the Google timedtext list endpoint for *video_id*.
# Returns a dict {lang_code: track name} on success, or a tuple
# (error message, None) on failure.
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# NOTE(review): the 'try:' before urlopen and the final
# 'return sub_lang_list' were elided from this listing -- TODO confirm.
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
def _list_available_subtitles(self, video_id):
    """Fetch and print the subtitle languages available for *video_id*."""
    available = self._get_available_subtitles(video_id)
    self.report_video_subtitles_available(video_id, available)
# Downloads one subtitle track; returns (None, sub_lang, sub) on success
# or (error message, None, None) on failure.
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# NOTE(review): the docstring quotes around the next line, the urlencode
# dict body, and the 'try:'/'if not sub:' lines were elided -- TODO confirm.
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
# Picks a single subtitle language (user choice > 'en' > first available)
# and downloads it.
337 def _extract_subtitle(self, video_id):
# NOTE(review): the docstring quotes, the "sub_lang = 'en'" branch body,
# and the final 'return [subtitle]' were elided -- TODO confirm upstream.
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple (instead of a dict) signals a lookup failure upstream.
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Downloads every available subtitle track for *video_id*.
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple (instead of a dict) signals a lookup failure upstream.
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
# NOTE(review): the 'subtitles = []' initializer and the final
# 'return subtitles' were elided from this listing -- TODO confirm.
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
# Prints each itag with its extension and dimensions for --list-formats.
369 def _print_formats(self, formats):
370 print('Available formats:')
# NOTE(review): the 'for x in formats:' line was elided from this listing.
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Sets the interface language, then logs in (credentials or .netrc) and
# confirms age. Heavily elided in this listing: several 'try:'/'if'/'else'
# lines and parts of the login form dict are missing -- left byte-identical.
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are a warning, not fatal: extraction can proceed
# without authentication.
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# GALX/dsh are hidden anti-forgery tokens scraped from the login form.
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, the login failed.
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Unlike the language/login steps, failing to confirm age is fatal.
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Extracts the 11-character video ID from any supported YouTube URL form.
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the 'if mobj is None:' guard and the final
# 'return video_id' were elided from this listing -- TODO confirm.
480 raise ExtractorError(u'Invalid URL: %s' % url)
481 video_id = mobj.group(2)
# Main YouTube extraction: resolves redirects, downloads the watch page
# and get_video_info, then builds one info dict per selected format.
# Heavily elided in this listing (many 'try:'/'if'/'else' lines missing);
# code left byte-identical with review notes only.
484 def _real_extract(self, url):
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# get_video_info is tried with several 'el' values until one returns a
# usable token.
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
# uploader (mandatory)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (optional, scraped from the watch page)
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
# title (mandatory)
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional)
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date, normalized to YYYYMMDD via unified_strdate
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
# description: page element first, meta tag as fallback
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
# subtitles, driven by the writesubtitles/allsubtitles/listsubtitles opts
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
614 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
615 url_data = compat_parse_qs(url_data_str)
616 if 'itag' in url_data and 'url' in url_data:
617 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
618 if not 'ratebypass' in url: url += '&ratebypass=yes'
619 url_map[url_data['itag'][0]] = url
621 format_limit = self._downloader.params.get('format_limit', None)
622 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
623 if format_limit is not None and format_limit in available_formats:
624 format_list = available_formats[available_formats.index(format_limit):]
626 format_list = available_formats
627 existing_formats = [x for x in format_list if x in url_map]
628 if len(existing_formats) == 0:
629 raise ExtractorError(u'no known formats available for video')
630 if self._downloader.params.get('listformats', None):
631 self._print_formats(existing_formats)
633 if req_format is None or req_format == 'best':
634 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
635 elif req_format == 'worst':
636 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
637 elif req_format in ('-1', 'all'):
638 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
640 # Specific formats. We pick the first in a slash-delimeted sequence.
641 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
642 req_formats = req_format.split('/')
643 video_url_list = None
644 for rf in req_formats:
646 video_url_list = [(rf, url_map[rf])]
648 if video_url_list is None:
649 raise ExtractorError(u'requested format not available')
651 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# One info dict per selected (format, url) pair.
654 for format_param, video_real_url in video_url_list:
656 video_extension = self._video_extensions.get(format_param, 'flv')
658 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
659 self._video_dimensions.get(format_param, '???'))
663 'url': video_real_url,
664 'uploader': video_uploader,
665 'uploader_id': video_uploader_id,
666 'upload_date': upload_date,
667 'title': video_title,
668 'ext': video_extension,
669 'format': video_format,
670 'thumbnail': video_thumbnail,
671 'description': video_description,
672 'player_url': player_url,
673 'subtitles': video_subtitles,
674 'duration': video_duration
679 class MetacafeIE(InfoExtractor):
680 """Information Extractor for metacafe.com."""
# URLs used by _real_initialize to bypass the family filter.
682 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
683 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
684 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
685 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Log that the family-filter disclaimer page is being fetched."""
    self.to_screen(u'Retrieving disclaimer')
# Fetches the disclaimer page and posts the over-18 confirmation so the
# family filter does not hide videos.
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
# NOTE(review): both 'try:' lines and the disclaimer_form dict opener were
# elided from this listing -- TODO confirm against upstream.
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Metacafe extraction: delegates yt- prefixed IDs to YouTube, otherwise
# scrapes mediaURL/gdaKey (or the flashvars JSON) from the watch page.
# Elided lines ('if mobj is None:' guards, try/else branches, the final
# return) are noted below; code left byte-identical.
712 def _real_extract(self, url):
713 # Extract id and simplified title from URL
714 mobj = re.match(self._VALID_URL, url)
716 raise ExtractorError(u'Invalid URL: %s' % url)
718 video_id = mobj.group(1)
720 # Check if video comes from YouTube
721 mobj2 = re.match(r'^yt-(.*)$', video_id)
722 if mobj2 is not None:
723 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
725 # Retrieve video webpage to extract further information
726 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
728 # Extract URL, uploader and title from webpage
729 self.report_extraction(video_id)
730 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
732 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
733 video_extension = mediaURL[-3:]
735 # Extract gdaKey if available
736 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
740 gdaKey = mobj.group(1)
741 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaURL/key.
743 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
745 raise ExtractorError(u'Unable to extract media URL')
746 vardict = compat_parse_qs(mobj.group(1))
747 if 'mediaData' not in vardict:
748 raise ExtractorError(u'Unable to extract media URL')
749 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
751 raise ExtractorError(u'Unable to extract media URL')
752 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
753 video_extension = mediaURL[-3:]
754 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
756 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
758 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on these str values is Python-2-only behavior.
759 video_title = mobj.group(1).decode('utf-8')
761 mobj = re.search(r'submitter=(.*?);', webpage)
763 raise ExtractorError(u'Unable to extract uploader nickname')
764 video_uploader = mobj.group(1)
767 'id': video_id.decode('utf-8'),
768 'url': video_url.decode('utf-8'),
769 'uploader': video_uploader.decode('utf-8'),
771 'title': video_title,
772 'ext': video_extension.decode('utf-8'),
775 class DailymotionIE(InfoExtractor):
776 """Information Extractor for Dailymotion"""
# Case-insensitive match for any dailymotion TLD video URL.
778 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
779 IE_NAME = u'dailymotion'
# Dailymotion extraction: disables the family filter via cookie, then
# scrapes flashvars for the best-quality stream URL. Elided lines
# ('if mobj is None:' guards, quality-loop body, the final return dict)
# are noted below; code left byte-identical.
781 def _real_extract(self, url):
782 # Extract id and simplified title from URL
783 mobj = re.match(self._VALID_URL, url)
785 raise ExtractorError(u'Invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
# The cookie disables Dailymotion's family filter for this request.
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 raise ExtractorError(u'Unable to extract media URL')
801 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Qualities are probed best-first; first key present in flashvars wins.
803 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
806 self.to_screen(u'Using %s' % key)
809 raise ExtractorError(u'Unable to extract video URL')
811 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
813 raise ExtractorError(u'Unable to extract video URL')
815 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
817 # TODO: support choosing qualities
819 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
821 raise ExtractorError(u'Unable to extract title')
822 video_title = unescapeHTML(mobj.group('title'))
824 video_uploader = None
825 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
827 # lookin for official user
828 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
# Missing uploader is only a warning; extraction continues.
829 if mobj_official is None:
830 self._downloader.report_warning(u'unable to extract uploader nickname')
832 video_uploader = mobj_official.group(1)
834 video_uploader = mobj.group(1)
836 video_upload_date = None
837 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Page shows DD-MM-YYYY; reassembled here as YYYYMMDD.
839 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
844 'uploader': video_uploader,
845 'upload_date': video_upload_date,
846 'title': video_title,
847 'ext': video_extension,
852 class PhotobucketIE(InfoExtractor):
853 """Information extractor for photobucket.com."""
855 # TODO: the original _VALID_URL was:
856 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
857 # Check if it's necessary to keep the old extracion process
858 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
859 IE_NAME = u'photobucket'
# Photobucket extraction: tries the embedded JSON first, then falls back
# to the video_src link tag. Elided lines ('if mobj is None:' guards, the
# 'id' entry of the JSON result dict, returns) are noted; byte-identical.
861 def _real_extract(self, url):
862 # Extract id from URL
863 mobj = re.match(self._VALID_URL, url)
865 raise ExtractorError(u'Invalid URL: %s' % url)
867 video_id = mobj.group('id')
869 video_extension = mobj.group('ext')
871 # Retrieve video webpage to extract further information
872 webpage = self._download_webpage(url, video_id)
874 # Extract URL, uploader, and title from webpage
875 self.report_extraction(video_id)
876 # We try first by looking the javascript code:
877 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
879 info = json.loads(mobj.group('json'))
# NOTE(review): these lines are the body of a returned info dict whose
# opener/closer were elided from this listing -- TODO confirm upstream.
882 'url': info[u'downloadUrl'],
883 'uploader': info[u'username'],
884 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
885 'title': info[u'title'],
886 'ext': video_extension,
887 'thumbnail': info[u'thumbUrl'],
890 # We try looking in other parts of the webpage
891 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
893 raise ExtractorError(u'Unable to extract media URL')
894 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
898 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
900 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on these str values is Python-2-only behavior.
901 video_title = mobj.group(1).decode('utf-8')
903 video_uploader = mobj.group(2).decode('utf-8')
906 'id': video_id.decode('utf-8'),
907 'url': video_url.decode('utf-8'),
908 'uploader': video_uploader,
910 'title': video_title,
911 'ext': video_extension.decode('utf-8'),
914 class YahooIE(InfoExtractor):
915 """Information extractor for screen.yahoo.com."""
# Video ID is the trailing digits before '.html' in the page URL.
916 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
# Yahoo Screen extraction: queries the bcst.yahoo.com REST service twice,
# first for metadata (title/description/thumb/date), then for the stream
# URL/path. Elided lines ('if ... is None:' guards, the info_re closing
# quotes, the final result dict) are noted; code left byte-identical.
918 def _real_extract(self, url):
919 mobj = re.match(self._VALID_URL, url)
921 raise ExtractorError(u'Invalid URL: %s' % url)
922 video_id = mobj.group('id')
924 # TODO: Check which url parameters are required
925 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
926 webpage = self._download_webpage(info_url, video_id, "Downloading info webpage")
# NOTE(review): the closing ''' of this verbose regex was elided from the
# listing -- no comments inserted below to avoid landing inside it.
927 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
928 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
929 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
930 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
932 self.report_extraction(video_id)
933 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
935 raise ExtractorError(u'Unable to extract video info')
936 video_title = m_info.group('title')
937 video_description = m_info.group('description')
938 video_thumb = m_info.group('thumb')
939 video_date = m_info.group('date')
940 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
942 # TODO: Find a way to get mp4 videos
943 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
944 webpage = self._download_webpage(rest_url, video_id, 'Downloading video url webpage')
945 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
947 raise ExtractorError(u'Unable to extract video url')
951 'url':m_rest.group('url'),
952 'play_path': m_rest.group('path'),
954 'description': video_description,
955 'thumbnail': video_thumb,
956 'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # Named groups: proto (optional scheme), direct_link (player-redirect
    # form), id (the numeric video id).
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Download the Vimeo page, parse its embedded config JSON and
        collect the video information.

        NOTE(review): this view of the file has extraction gaps; several
        guard/glue lines appear elided and are flagged inline below.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            # No scheme in the input URL: force https.
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Player redirect URL: rebuild the canonical watch URL.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page markup.
        # NOTE(review): an enclosing try/except appears elided around these lines.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title from the parsed config.
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): an `else:` opener appears elided before this line.
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first populated quality bucket, best first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a `break` and a for-`else:` appear elided; this raise is
        # presumably the no-codec-found fallback — confirm against upstream.
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the `return [{` opener and the id/url entries appear elided.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): many glue lines (try:, returns, argument separators) are
    # missing from this view; apparent elisions are flagged inline below.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in an index-N.html path.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page contents."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): a `try:` opener appears elided before the download below.
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # NOTE(review): `return webpage` appears elided here.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and collect the
        groups named in *matchTuples* — (group index, key, error message)
        triples — into a dict keyed by `key`."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): `info = {}` and an `if mobj is None:` guard appear elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): an `else:` opener appears elided before this line.
            info[key] = mobj.group(i)
        # NOTE(review): `return info` appears elided here.

    def extractLiveStream(self, url):
        """Resolve the stream parameters for an arte.tv live page."""
        # Language is encoded in the URL path (fr/de).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the url/flags/tuple-list arguments of this call
            # are partially elided in this view.
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # NOTE(review): call arguments partially elided here as well.
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract video info for an arte.tv +7 (catch-up) page."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # NOTE(review): call arguments partially elided.
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        # NOTE(review): a `return {` opener appears elided before these entries.
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages are recognised by their trailing index-N.html segment.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): a `return` / `else:` pair appears elided here.
        info = self.extractPlus7Stream(url)
        # NOTE(review): `return [info]` appears elided here.
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we're falling back, except in test mode.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD so only headers are fetched.
            def get_method(self):
                # NOTE(review): `return "HEAD"` appears elided here.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # NOTE(review): remaining keyword args and the else branch appear elided.
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): fp drain/close lines appear elided here.
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener wired with the HEAD-aware handlers above.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        # NOTE(review): the same-URL short-circuit appears elided here.
        self.report_following_redirect(new_url)
        # NOTE(review): `return new_url` appears elided here.

    def _real_extract(self, url):
        # Follow URL-shortener style redirects first.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): a `try:` opener appears elided before the download below.
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards appear elided between the
        # progressively broader searches below.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener and id/url entries appear elided.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Matches ytsearch:, ytsearchN:, ytsearchall: pseudo-URLs.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData v2 search endpoint, 50 results per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): the prefix-stripping line appears elided here.
        query = query.encode('utf-8')
        # NOTE(review): an `if prefix == '':` opener appears elided before this return.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
        # NOTE(review): `else:` / `try:` / `n = int(prefix)` / `if n <= 0:`
        # appear elided before this raise.
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): video_ids/pagenum/limit initialisation appears elided.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): a `try:` opener appears elided here.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the target at what the API says actually exists.
            limit = min(n, api_response['totalItems'])
            # NOTE(review): `pagenum += 1` appears elided here.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        # NOTE(review): `return videos` appears elided here.
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Matches gvsearch:, gvsearchN:, gvsearchall: pseudo-URLs.
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    # Presence of the "next" pager link means more result pages exist.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # NOTE(review): an `if prefix == '':` opener appears elided before this return.
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        # NOTE(review): `else:` / `n = int(prefix)` / `if n <= 0:` appear
        # elided before this raise.
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
                return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the `res = {` playlist-dict opener appears elided.
        '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each organic result link becomes one playlist entry.
            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the `e = {` entry-dict opener appears elided.
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once enough results were seen or no further pages exist.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): `return res` appears elided here.
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Matches yvsearch:, yvsearchN:, yvsearchall: pseudo-URLs.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): the prefix-stripping line appears elided here.
        query = query.encode('utf-8')
        # NOTE(review): an `if prefix == '':` opener appears elided before this call.
        self._download_n_results(query, 1)
        # NOTE(review): `return` appears elided here.
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): `return` / `else:` / `try:` / `n = int(prefix)` /
        # `if n <= 0:` appear elided before this raise.
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
                self._download_n_results(query, n)
            # NOTE(review): `return` appears elided here.
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                # NOTE(review): `return` appears elided here.

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Track ids already queued to skip duplicates across pages.
        already_seen = set()
        # NOTE(review): video_ids/pagenum initialisation and the page-loop
        # header appear elided here.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): a `try:` opener appears elided here.
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # NOTE(review): `return` appears elided here.

        # No further pages: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # NOTE(review): `return` appears elided here.

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose (re.VERBOSE) pattern; NOTE(review): several alternation lines
    # and the closing triple-quote appear elided from this view.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # GData playlist feed, paginated via max-results/start-index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE because _VALID_URL is written as a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): videos-list initialisation and the page-loop header
        # appear elided here.
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # NOTE(review): a `try:` opener appears elided before json.loads.
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # NOTE(review): `break` appears elided here.

        # Keep (position, url) pairs so entries can be sorted later.
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means this was the last one.
        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): `break` appears elided here.

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is plain HTML; subsequent pages come from the ajax endpoint.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the distinct watch-link video ids found in *page*, in order."""
        # NOTE(review): `ids_in_page = []` appears elided here.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # NOTE(review): `return ids_in_page` appears elided here.

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): video_ids/pagenum initialisation appears elided here.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): a `while True:` loop header appears elided here.
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # Ajax endpoint returns JSON with the rendered HTML inside.
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # NOTE(review): `break` appears elided here.

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Matches user pages and the ytuser: shorthand.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): video_ids/pagenum initialisation and the loop header
        # appear elided here.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): `break` (and a `pagenum += 1` after the if)
            # appear elided here.

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): a `_PAGE_SIZE` class attribute appears elided here; it is
    # referenced below.
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Episode-list ajax endpoint; %s is filled with the numeric user id.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # Numeric user id is embedded in the user page markup.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): video_ids/pagenum initialisation and the loop header
        # appear elided here.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): `break` (and a `pagenum += 1` after the if)
            # appear elided here.

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): a `try:` opener appears elided here.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an `else:` opener appears elided before this raise.
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the `return [{` opener appears elided before these entries.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from params or .netrc, when available."""
        if self._downloader is None:
            # NOTE(review): `return` appears elided here.

        # NOTE(review): useremail/password default initialisation appears elided.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): a `try:` opener appears elided here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): the credential assignments and an `else:`
                # opener appear elided before this raise.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials: skip login entirely.
        if useremail is None:
            # NOTE(review): `return` and the login_form construction appear
            # elided here.

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): a `try:` opener and the report_login() call appear elided.
        login_results = compat_urllib_request.urlopen(request).read()
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # NOTE(review): `return` appears elided here.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # NOTE(review): `return` appears elided here.

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two JS fragments in the page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): an `if not m:` guard appears elided here.
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        # NOTE(review): an `if not video_url:` guard appears elided here.
        video_url = video_data['sd_src']
        # NOTE(review): an `if not video_url:` guard appears elided here.
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # NOTE(review): an `if not m:` guard appears elided here.
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): the `info = {` opener and the id/url entries appear elided.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an `if mobj is None:` guard appears elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose URL fragment carries the
            # file id; resolve it and re-run extraction on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): the cchar ('?' vs '&') selection appears elided here.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # Spoofed UA: the JSON skin endpoint expects an iTunes client.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): `info = None` and a `try:` opener appear elided here.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the `info = {` opener and several entries appear elided.
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): a `try:` opener appears elided here.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            # NOTE(review): a `try:` opener appears elided here.
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # NOTE(review): an `else:` branch appears elided here.

            # NOTE(review): looks like the site timestamp is '%m-%d-%y %H:%M%p'
            # — confirm; '%H' (24h) combined with '%p' is unusual.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): an `if umobj is None:` guard appears elided here.
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): the `info = {` opener appears elided before these entries.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
        # NOTE(review): `return [info]` appears elided here.
2001 class MyVideoIE(InfoExtractor):
2002 """Information Extractor for myvideo.de."""
2004 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2005 IE_NAME = u'myvideo'
2007 def _real_extract(self,url):
# Fetch the watch page, read the media host from the image_src <link>,
# and build the .flv URL next to the thumbnails directory.
2008 mobj = re.match(self._VALID_URL, url)
2010 raise ExtractorError(u'Invalid URL: %s' % url)
2012 video_id = mobj.group(1)
2015 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2016 webpage = self._download_webpage(webpage_url, video_id)
2018 self.report_extraction(video_id)
# The captured group is the media base URL; the video file is '<base>/<id>.flv'.
2019 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2022 raise ExtractorError(u'Unable to extract media URL')
2023 video_url = mobj.group(1) + ('/%s.flv' % video_id)
# Title comes straight from the page's <title> element.
2025 mobj = re.search('<title>([^<]+)</title>', webpage)
2027 raise ExtractorError(u'Unable to extract title')
2029 video_title = mobj.group(1)
2035 'upload_date': None,
2036 'title': video_title,
2040 class ComedyCentralIE(InfoExtractor):
2041 """Information extractor for The Daily Show and Colbert Report """
2043 # urls can be abbreviations like :thedailyshow or :colbert
2044 # urls for episodes like:
2045 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2046 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2047 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2048 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2049 |(https?://)?(www\.)?
2050 (?P<showname>thedailyshow|colbertnation)\.com/
2051 (full-episodes/(?P<episode>.*)|
2053 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2054 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest to highest; the last entry is the preferred default.
2057 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2059 _video_extensions = {
2067 _video_dimensions = {
2077 def suitable(cls, url):
2078 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE whitespace.
2079 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2081 def _print_formats(self, formats):
# List each format id with its extension and dimensions for --list-formats.
2082 print('Available formats:')
2084 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2087 def _real_extract(self, url):
# Resolve shortcuts (:tds / :colbert), follow redirects to a specific
# episode, locate the mtvnservices Flash URI, then walk the MRSS index to
# download per-part configuration and pick an RTMP rendition.
2088 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2090 raise ExtractorError(u'Invalid URL: %s' % url)
2092 if mobj.group('shortname'):
# Map the ':name' abbreviations onto the shows' full-episodes pages.
2093 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2094 url = u'http://www.thedailyshow.com/full-episodes/'
2096 url = u'http://www.colbertnation.com/full-episodes/'
2097 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2098 assert mobj is not None
2100 if mobj.group('clip'):
2101 if mobj.group('showname') == 'thedailyshow':
2102 epTitle = mobj.group('tdstitle')
2104 epTitle = mobj.group('cntitle')
2107 dlNewest = not mobj.group('episode')
2109 epTitle = mobj.group('showname')
2111 epTitle = mobj.group('episode')
2113 self.report_extraction(epTitle)
2114 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# The landing page may redirect to a concrete episode; re-match on the
# final URL so the 'episode' group reflects where we actually landed.
2116 url = htmlHandle.geturl()
2117 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2119 raise ExtractorError(u'Invalid redirected URL: ' + url)
2120 if mobj.group('episode') == '':
2121 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2122 epTitle = mobj.group('episode')
2124 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2126 if len(mMovieParams) == 0:
2127 # The Colbert Report embeds the information in a without
2128 # a URL prefix; so extract the alternate reference
2129 # and then add the URL prefix manually.
2131 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2132 if len(altMovieParams) == 0:
2133 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2135 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2137 uri = mMovieParams[0][1]
2138 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2139 indexXml = self._download_webpage(indexUrl, epTitle,
2140 u'Downloading show index',
2141 u'unable to download episode index')
2145 idoc = xml.etree.ElementTree.fromstring(indexXml)
# One <item> per part of the episode; each carries its own media id.
2146 itemEls = idoc.findall('.//item')
2147 for partNum,itemEl in enumerate(itemEls):
2148 mediaId = itemEl.findall('./guid')[0].text
2149 shortMediaId = mediaId.split(':')[-1]
2150 showId = mediaId.split(':')[-2].replace('.com', '')
2151 officialTitle = itemEl.findall('./title')[0].text
2152 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2154 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2155 compat_urllib_parse.urlencode({'uri': mediaId}))
2156 configXml = self._download_webpage(configUrl, epTitle,
2157 u'Downloading configuration for %s' % shortMediaId)
2159 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) tuples from each <rendition>.
2161 for rendition in cdoc.findall('.//rendition'):
2162 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2166 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2169 if self._downloader.params.get('listformats', None):
2170 self._print_formats([i[0] for i in turls])
2173 # For now, just pick the highest bitrate
2174 format,rtmp_video_url = turls[-1]
2176 # Get the format arg from the arg stream
2177 req_format = self._downloader.params.get('format', None)
2179 # Select format if we can find one
2182 format, rtmp_video_url = f, v
# rtmpdump-hostile: rewrite the RTMP URL onto the known HTTP mirror
# (the 'gsp.comedystor/...' path is reused on the llnwd.net host).
2185 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2187 raise ExtractorError(u'Cannot transform RTMP url')
2188 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2189 video_url = base + m.group('finalid')
2191 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2196 'upload_date': officialDate,
2201 'description': officialTitle,
# Each part becomes one info dict; the method returns them all.
2203 results.append(info)
2208 class EscapistIE(InfoExtractor):
2209 """Information extractor for The Escapist """
2211 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2212 IE_NAME = u'escapist'
2214 def _real_extract(self, url):
# Scrape OpenGraph/meta tags for description, thumbnail and player URL,
# then pull the player's config (JS object) and read the media URL from it.
2215 mobj = re.match(self._VALID_URL, url)
2217 raise ExtractorError(u'Invalid URL: %s' % url)
2218 showName = mobj.group('showname')
2219 videoId = mobj.group('episode')
2221 self.report_extraction(showName)
2222 webPage = self._download_webpage(url, showName)
2224 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2225 description = unescapeHTML(descMatch.group(1))
2226 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2227 imgUrl = unescapeHTML(imgMatch.group(1))
2228 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2229 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL is passed to the player as a percent-encoded query param.
2230 configUrlMatch = re.search('config=(.*)$', playerUrl)
2231 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2233 configJSON = self._download_webpage(configUrl, showName,
2234 u'Downloading configuration',
2235 u'unable to download configuration')
2237 # Technically, it's JavaScript, not JSON
2238 configJSON = configJSON.replace("'", '"')
2241 config = json.loads(configJSON)
2242 except (ValueError,) as err:
2243 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2245 playlist = config['playlist']
# Index 1 holds the actual video entry (index 0 is presumably an intro/ad —
# TODO confirm against a live config).
2246 videoUrl = playlist[1]['url']
2251 'uploader': showName,
2252 'upload_date': None,
2255 'thumbnail': imgUrl,
2256 'description': description,
2257 'player_url': playerUrl,
2262 class CollegeHumorIE(InfoExtractor):
2263 """Information extractor for collegehumor.com"""
2266 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2267 IE_NAME = u'collegehumor'
2269 def report_manifest(self, video_id):
"""Report information extraction."""
2270 """Report information extraction."""
2271 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2273 def _real_extract(self, url):
# Two-stage extraction: the moogaloop XML gives metadata plus an f4m
# manifest URL; the Adobe f4m manifest then yields the actual segment URL.
2274 mobj = re.match(self._VALID_URL, url)
2276 raise ExtractorError(u'Invalid URL: %s' % url)
2277 video_id = mobj.group('videoid')
2282 'upload_date': None,
2285 self.report_extraction(video_id)
2286 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2288 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2289 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2290 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2292 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2294 videoNode = mdoc.findall('./video')[0]
2295 info['description'] = videoNode.findall('./description')[0].text
2296 info['title'] = videoNode.findall('./caption')[0].text
2297 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2298 manifest_url = videoNode.findall('./file')[0].text
2300 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest to be served.
2302 manifest_url += '?hdcore=2.10.3'
2303 self.report_manifest(video_id)
2305 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2306 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2307 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2309 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe namespace; grab media url + id.
2311 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2312 node_id = media_node.attrib['url']
2313 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2314 except IndexError as err:
2315 raise ExtractorError(u'Invalid manifest file')
2317 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Compose the HDS fragment URL ('/z<id>/<node>Seg1-Frag1') by hand.
2318 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2325 class XVideosIE(InfoExtractor):
2326 """Information extractor for xvideos.com"""
2328 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2329 IE_NAME = u'xvideos'
2331 def _real_extract(self, url):
# Scrape the page for the percent-encoded flv_url, the <title>, and the
# thumbnail URL; no API involved.
2332 mobj = re.match(self._VALID_URL, url)
2334 raise ExtractorError(u'Invalid URL: %s' % url)
2335 video_id = mobj.group(1)
2337 webpage = self._download_webpage(url, video_id)
2339 self.report_extraction(video_id)
2343 mobj = re.search(r'flv_url=(.+?)&', webpage)
2345 raise ExtractorError(u'Unable to extract video url')
# flv_url value is URL-encoded in the page source.
2346 video_url = compat_urllib_parse.unquote(mobj.group(1))
2350 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2352 raise ExtractorError(u'Unable to extract video title')
2353 video_title = mobj.group(1)
2356 # Extract video thumbnail
2357 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2359 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL, not just the filename group.
2360 video_thumbnail = mobj.group(0)
2366 'upload_date': None,
2367 'title': video_title,
2369 'thumbnail': video_thumbnail,
2370 'description': None,
2376 class SoundcloudIE(InfoExtractor):
2377 """Information extractor for soundcloud.com
2378 To access the media, the uid of the song and a stream token
2379 must be extracted from the page source and the script must make
2380 a request to media.soundcloud.com/crossdomain.xml. Then
2381 the media can be grabbed by requesting from an url composed
2382 of the stream token and uid
2385 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2386 IE_NAME = u'soundcloud'
2388 def report_resolve(self, video_id):
2389 """Report information extraction."""
2390 self.to_screen(u'%s: Resolving id' % video_id)
2392 def _real_extract(self, url):
# Resolve '<uploader>/<slug>' to a track id via resolve.json, then fetch
# the per-track stream definitions and take the 128kbps MP3 stream.
2393 mobj = re.match(self._VALID_URL, url)
2395 raise ExtractorError(u'Invalid URL: %s' % url)
2397 # extract uploader (which is in the url)
2398 uploader = mobj.group(1)
2399 # extract simple title (uploader + slug of song title)
2400 slug_title = mobj.group(2)
2401 simple_title = uploader + u'-' + slug_title
2402 full_title = '%s/%s' % (uploader, slug_title)
2404 self.report_resolve(full_title)
2406 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
# client_id is a fixed API key embedded in the extractor.
2407 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2408 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2410 info = json.loads(info_json)
2411 video_id = info['id']
2412 self.report_extraction(full_title)
2414 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2415 stream_json = self._download_webpage(streams_url, full_title,
2416 u'Downloading stream definitions',
2417 u'unable to download stream definitions')
2419 streams = json.loads(stream_json)
2420 mediaURL = streams['http_mp3_128_url']
2421 upload_date = unified_strdate(info['created_at'])
2426 'uploader': info['user']['username'],
2427 'upload_date': upload_date,
2428 'title': info['title'],
2430 'description': info['description'],
2433 class SoundcloudSetIE(InfoExtractor):
2434 """Information extractor for soundcloud.com sets
2435 To access the media, the uid of the song and a stream token
2436 must be extracted from the page source and the script must make
2437 a request to media.soundcloud.com/crossdomain.xml. Then
2438 the media can be grabbed by requesting from an url composed
2439 of the stream token and uid
2442 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2443 IE_NAME = u'soundcloud:set'
2445 def report_resolve(self, video_id):
2446 """Report information extraction."""
2447 self.to_screen(u'%s: Resolving id' % video_id)
2449 def _real_extract(self, url):
# Like SoundcloudIE but for '/sets/' playlists: resolve the set once,
# then fetch stream definitions for every track in it.
2450 mobj = re.match(self._VALID_URL, url)
2452 raise ExtractorError(u'Invalid URL: %s' % url)
2454 # extract uploader (which is in the url)
2455 uploader = mobj.group(1)
2456 # extract simple title (uploader + slug of song title)
2457 slug_title = mobj.group(2)
2458 simple_title = uploader + u'-' + slug_title
2459 full_title = '%s/sets/%s' % (uploader, slug_title)
2461 self.report_resolve(full_title)
2463 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
# Same fixed client_id API key as the single-track extractor.
2464 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2465 info_json = self._download_webpage(resolv_url, full_title)
2468 info = json.loads(info_json)
# The resolver reports errors inline rather than via HTTP status.
2469 if 'errors' in info:
2470 for err in info['errors']:
2471 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2474 self.report_extraction(full_title)
2475 for track in info['tracks']:
2476 video_id = track['id']
2478 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2479 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2481 self.report_extraction(video_id)
2482 streams = json.loads(stream_json)
2483 mediaURL = streams['http_mp3_128_url']
2488 'uploader': track['user']['username'],
2489 'upload_date': unified_strdate(track['created_at']),
2490 'title': track['title'],
2492 'description': track['description'],
2497 class InfoQIE(InfoExtractor):
2498 """Information extractor for infoq.com"""
2499 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2501 def _real_extract(self, url):
# The media path is base64-encoded in the page's 'jsclassref' JS variable;
# decode it and prepend the site's RTMPE base to get the stream URL.
2502 mobj = re.match(self._VALID_URL, url)
2504 raise ExtractorError(u'Invalid URL: %s' % url)
2506 webpage = self._download_webpage(url, video_id=url)
2507 self.report_extraction(url)
2510 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2512 raise ExtractorError(u'Unable to extract video url')
2513 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2514 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2517 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2519 raise ExtractorError(u'Unable to extract video title')
2520 video_title = mobj.group(1)
2522 # Extract description
2523 video_description = u'No description available.'
2524 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2525 if mobj is not None:
2526 video_description = mobj.group(1)
# Derive id/extension from the decoded media filename.
2528 video_filename = video_url.split('/')[-1]
2529 video_id, extension = video_filename.split('.')
2535 'upload_date': None,
2536 'title': video_title,
2537 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2539 'description': video_description,
2544 class MixcloudIE(InfoExtractor):
2545 """Information extractor for www.mixcloud.com"""
2547 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2548 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2549 IE_NAME = u'mixcloud'
2551 def report_download_json(self, file_id):
2552 """Report JSON download."""
2553 self.to_screen(u'Downloading json')
2555 def get_urls(self, jsonData, fmt, bitrate='best'):
2556 """Get urls from 'audio_formats' section in json"""
# Formats may map bitrate->urls or be a flat url list; handle both shapes.
2559 bitrate_list = jsonData[fmt]
2560 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2561 bitrate = max(bitrate_list) # select highest
2563 url_list = jsonData[fmt][bitrate]
2564 except TypeError: # we have no bitrate info.
2565 url_list = jsonData[fmt]
2568 def check_urls(self, url_list):
2569 """Returns 1st active url from list"""
# Probe each candidate URL; the first that opens without error wins.
2570 for url in url_list:
2572 compat_urllib_request.urlopen(url)
2574 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2579 def _print_formats(self, formats):
# List formats for --list-formats; tolerate missing bitrate level.
2580 print('Available formats:')
2581 for fmt in formats.keys():
2582 for b in formats[fmt]:
2584 ext = formats[fmt][b][0]
2585 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2586 except TypeError: # we have no bitrate info
2587 ext = formats[fmt][0]
2588 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2591 def _real_extract(self, url):
# Query the cloudcast JSON API, then choose a format (user-requested or
# best) and the first reachable URL for it.
2592 mobj = re.match(self._VALID_URL, url)
2594 raise ExtractorError(u'Invalid URL: %s' % url)
2595 # extract uploader & filename from url
# NOTE(review): .decode on a str here is Python-2-only; breaks on Python 3.
2596 uploader = mobj.group(1).decode('utf-8')
2597 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2599 # construct API request
2600 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2601 # retrieve .json file with links to files
2602 request = compat_urllib_request.Request(file_url)
2604 self.report_download_json(file_url)
2605 jsonData = compat_urllib_request.urlopen(request).read()
2606 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2607 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2610 json_data = json.loads(jsonData)
2611 player_url = json_data['player_swf_url']
2612 formats = dict(json_data['audio_formats'])
2614 req_format = self._downloader.params.get('format', None)
2617 if self._downloader.params.get('listformats', None):
2618 self._print_formats(formats)
2621 if req_format is None or req_format == 'best':
# No explicit format: take the first format with a live URL.
2622 for format_param in formats.keys():
2623 url_list = self.get_urls(formats, format_param)
2625 file_url = self.check_urls(url_list)
2626 if file_url is not None:
2629 if req_format not in formats:
2630 raise ExtractorError(u'Format is not available')
2632 url_list = self.get_urls(formats, req_format)
2633 file_url = self.check_urls(url_list)
2634 format_param = req_format
2637 'id': file_id.decode('utf-8'),
2638 'url': file_url.decode('utf-8'),
2639 'uploader': uploader.decode('utf-8'),
2640 'upload_date': None,
2641 'title': json_data['name'],
2642 'ext': file_url.split('.')[-1].decode('utf-8'),
2643 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2644 'thumbnail': json_data['thumbnail_url'],
2645 'description': json_data['description'],
2646 'player_url': player_url.decode('utf-8'),
2649 class StanfordOpenClassroomIE(InfoExtractor):
2650 """Information extractor for Stanford's Open ClassRoom"""
2652 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2653 IE_NAME = u'stanfordoc'
2655 def _real_extract(self, url):
# Three-level extractor: a specific video (course+video), a course page
# (list of VideoPage links), or the site root (list of CoursePage links).
# The list cases recurse via self.extract on each discovered reference.
2656 mobj = re.match(self._VALID_URL, url)
2658 raise ExtractorError(u'Invalid URL: %s' % url)
2660 if mobj.group('course') and mobj.group('video'): # A specific video
2661 course = mobj.group('course')
2662 video = mobj.group('video')
2664 'id': course + '_' + video,
2666 'upload_date': None,
2669 self.report_extraction(info['id'])
2670 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2671 xmlUrl = baseUrl + video + '.xml'
2673 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2674 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2675 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2676 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2678 info['title'] = mdoc.findall('./title')[0].text
2679 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2681 raise ExtractorError(u'Invalid metadata XML file')
2682 info['ext'] = info['url'].rpartition('.')[2]
2684 elif mobj.group('course'): # A course page
2685 course = mobj.group('course')
2690 'upload_date': None,
2693 coursepage = self._download_webpage(url, info['id'],
2694 note='Downloading course info page',
2695 errnote='Unable to download course info page')
# Course title falls back to the id when no <h1> is present.
2697 m = re.search('<h1>([^<]+)</h1>', coursepage)
2699 info['title'] = unescapeHTML(m.group(1))
2701 info['title'] = info['id']
2703 m = re.search('<description>([^<]+)</description>', coursepage)
2705 info['description'] = unescapeHTML(m.group(1))
# Collect unique VideoPage links in page order.
2707 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2710 'type': 'reference',
2711 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2715 for entry in info['list']:
2716 assert entry['type'] == 'reference'
2717 results += self.extract(entry['url'])
2721 'id': 'Stanford OpenClassroom',
2724 'upload_date': None,
2727 self.report_download_webpage(info['id'])
2728 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2730 rootpage = compat_urllib_request.urlopen(rootURL).read()
2731 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2732 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2734 info['title'] = info['id']
# Root page: recurse into every CoursePage link.
2736 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2739 'type': 'reference',
2740 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2745 for entry in info['list']:
2746 assert entry['type'] == 'reference'
2747 results += self.extract(entry['url'])
2750 class MTVIE(InfoExtractor):
2751 """Information extractor for MTV.com"""
2753 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2756 def _real_extract(self, url):
# Read song/performer/uri/content-id from the page's mtv_* meta tags,
# then fetch the mediaGen XML and pick the last (highest-quality) rendition.
2757 mobj = re.match(self._VALID_URL, url)
2759 raise ExtractorError(u'Invalid URL: %s' % url)
2760 if not mobj.group('proto'):
# Scheme-less input: normalize so urlopen accepts it.
2761 url = 'http://' + url
2762 video_id = mobj.group('videoid')
2764 webpage = self._download_webpage(url, video_id)
2766 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2768 raise ExtractorError(u'Unable to extract song name')
# NOTE(review): .decode on a str is Python-2-only; breaks on Python 3.
2769 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2770 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2772 raise ExtractorError(u'Unable to extract performer')
2773 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2774 video_title = performer + ' - ' + song_name
2776 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2778 raise ExtractorError(u'Unable to mtvn_uri')
2779 mtvn_uri = mobj.group(1)
2781 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2783 raise ExtractorError(u'Unable to extract content id')
2784 content_id = mobj.group(1)
2786 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2787 self.report_extraction(video_id)
2788 request = compat_urllib_request.Request(videogen_url)
2790 metadataXml = compat_urllib_request.urlopen(request).read()
2791 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2792 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2794 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2795 renditions = mdoc.findall('.//rendition')
2797 # For now, always pick the highest quality.
2798 rendition = renditions[-1]
# Format label: '<ext>-<width>x<height>_<bitrate>' from rendition attrs.
2801 _,_,ext = rendition.attrib['type'].partition('/')
2802 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2803 video_url = rendition.find('./src').text
2805 raise ExtractorError('Invalid rendition field.')
2810 'uploader': performer,
2811 'upload_date': None,
2812 'title': video_title,
2820 class YoukuIE(InfoExtractor):
2821 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two bounded random components.
2824 nowTime = int(time.time() * 1000)
2825 random1 = random.randint(1000,1998)
2826 random2 = random.randint(1000,9999)
2828 return "%d%d%d" %(nowTime,random1,random2)
2830 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by the server-supplied seed
# (linear-congruential step: seed = (seed*211 + 30031) % 65536).
2832 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2834 for i in range(len(source)):
2835 seed = (seed * 211 + 30031 ) % 65536
2836 index = math.floor(seed / 65536 * len(source) )
2837 mixed.append(source[int(index)])
2838 source.remove(source[int(index)])
2839 #return ''.join(mixed)
2842 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index list through the seeded mix string.
2843 mixed = self._get_file_ID_mix_string(seed)
2844 ids = fileId.split('*')
2848 realId.append(mixed[int(ch)])
2849 return ''.join(realId)
2851 def _real_extract(self, url):
# Fetch the playlist JSON, decode the obfuscated file id, then build one
# download URL per segment (segment number is hex-encoded into the id).
2852 mobj = re.match(self._VALID_URL, url)
2854 raise ExtractorError(u'Invalid URL: %s' % url)
2855 video_id = mobj.group('ID')
2857 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2859 jsondata = self._download_webpage(info_url, video_id)
2861 self.report_extraction(video_id)
2863 config = json.loads(jsondata)
2865 video_title = config['data'][0]['title']
2866 seed = config['data'][0]['seed']
2868 format = self._downloader.params.get('format', None)
2869 supported_format = list(config['data'][0]['streamfileids'].keys())
2871 if format is None or format == 'best':
2872 if 'hd2' in supported_format:
2877 elif format == 'worst':
2885 fileid = config['data'][0]['streamfileids'][format]
# One key per segment; each key authorizes that segment's download URL.
2886 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2887 except (UnicodeDecodeError, ValueError, KeyError):
2888 raise ExtractorError(u'Unable to extract info section')
2891 sid = self._gen_sid()
2892 fileid = self._get_file_id(fileid, seed)
2894 #column 8,9 of fileid represent the segment number
2895 #fileid[7:9] should be changed
2896 for index, key in enumerate(keys):
2898 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2899 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2902 'id': '%s_part%02d' % (video_id, index),
2903 'url': download_url,
2905 'upload_date': None,
2906 'title': video_title,
2909 files_info.append(info)
2914 class XNXXIE(InfoExtractor):
2915 """Information extractor for xnxx.com"""
2917 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns for the media URL, title, and thumbnail.
2919 VIDEO_URL_RE = r'flv_url=(.*?)&'
2920 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2921 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2923 def _real_extract(self, url):
# Pure scraping: apply the three class-level regexes to the watch page.
2924 mobj = re.match(self._VALID_URL, url)
2926 raise ExtractorError(u'Invalid URL: %s' % url)
2927 video_id = mobj.group(1)
2929 # Get webpage content
2930 webpage = self._download_webpage(url, video_id)
2932 result = re.search(self.VIDEO_URL_RE, webpage)
2934 raise ExtractorError(u'Unable to extract video url')
# flv_url is percent-encoded in the page source.
2935 video_url = compat_urllib_parse.unquote(result.group(1))
2937 result = re.search(self.VIDEO_TITLE_RE, webpage)
2939 raise ExtractorError(u'Unable to extract video title')
2940 video_title = result.group(1)
2942 result = re.search(self.VIDEO_THUMB_RE, webpage)
2944 raise ExtractorError(u'Unable to extract video thumbnail')
2945 video_thumbnail = result.group(1)
2951 'upload_date': None,
2952 'title': video_title,
2954 'thumbnail': video_thumbnail,
2955 'description': None,
2959 class GooglePlusIE(InfoExtractor):
2960 """Information extractor for plus.google.com."""
2962 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2963 IE_NAME = u'plus.google'
2965 def report_extract_entry(self, url):
2966 """Report downloading extry"""
2967 self.to_screen(u'Downloading entry: %s' % url)
2969 def report_date(self, upload_date):
2970 """Report downloading extry"""
2971 self.to_screen(u'Entry date: %s' % upload_date)
2973 def report_uploader(self, uploader):
2974 """Report downloading extry"""
2975 self.to_screen(u'Uploader: %s' % uploader)
2977 def report_title(self, video_title):
2978 """Report downloading extry"""
2979 self.to_screen(u'Title: %s' % video_title)
2981 def report_extract_vid_page(self, video_page):
2982 """Report information extraction."""
2983 self.to_screen(u'Extracting video page: %s' % video_page)
2985 def _real_extract(self, url):
# Two-step scrape: the post page yields date/uploader/title plus the photo
# viewer URL; the viewer page lists redirector.googlevideo.com links per
# resolution, of which the highest is chosen.
2986 # Extract id from URL
2987 mobj = re.match(self._VALID_URL, url)
2989 raise ExtractorError(u'Invalid URL: %s' % url)
2991 post_url = mobj.group(0)
2992 video_id = mobj.group(1)
2994 video_extension = 'flv'
2996 # Step 1, Retrieve post webpage to extract further information
2997 self.report_extract_entry(post_url)
2998 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3000 # Extract update date
3002 pattern = 'title="Timestamp">(.*?)</a>'
3003 mobj = re.search(pattern, webpage)
3005 upload_date = mobj.group(1)
3006 # Convert timestring to a format suitable for filename
3007 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3008 upload_date = upload_date.strftime('%Y%m%d')
3009 self.report_date(upload_date)
3013 pattern = r'rel\="author".*?>(.*?)</a>'
3014 mobj = re.search(pattern, webpage)
3016 uploader = mobj.group(1)
3017 self.report_uploader(uploader)
3020 # Get the first line for title
3022 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3023 mobj = re.search(pattern, webpage)
3025 video_title = mobj.group(1)
3026 self.report_title(video_title)
3028 # Step 2, Stimulate clicking the image box to launch video
3029 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3030 mobj = re.search(pattern, webpage)
3032 raise ExtractorError(u'Unable to extract video page URL')
3034 video_page = mobj.group(1)
3035 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3036 self.report_extract_vid_page(video_page)
3039 # Extract video links on video page
3040 """Extract video links of all sizes"""
3041 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3042 mobj = re.findall(pattern, webpage)
3044 raise ExtractorError(u'Unable to extract video links')
3046 # Sort in resolution
3047 links = sorted(mobj)
3049 # Choose the lowest of the sort, i.e. highest resolution
3050 video_url = links[-1]
3051 # Only get the url. The resolution part in the tuple has no use anymore
3052 video_url = video_url[-1]
3053 # Treat escaped \u0026 style hex
# Python-2 str path uses unicode_escape directly; Python 3 round-trips
# through ASCII bytes to apply the same escape decoding.
3055 video_url = video_url.decode("unicode_escape")
3056 except AttributeError: # Python 3
3057 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3063 'uploader': uploader,
3064 'upload_date': upload_date,
3065 'title': video_title,
3066 'ext': video_extension,
# Information extractor for nba.com videos.
# NOTE(review): fragmentary listing — the `if mobj is None:` guard, the
# `_findProp` None-return branch, and parts of the returned info dict
# ('url', 'ext', 'title') are elided (embedded numbering jumps).
3069 class NBAIE(InfoExtractor):
3070 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3073 def _real_extract(self, url):
3074 mobj = re.match(self._VALID_URL, url)
3076 raise ExtractorError(u'Invalid URL: %s' % url)
3078 video_id = mobj.group(1)
3079 if video_id.endswith('/index.html'):
# Strip the trailing '/index.html' so the id maps onto the CDN path.
3080 video_id = video_id[:-len('/index.html')]
3082 webpage = self._download_webpage(url, video_id)
# Direct MP4 URL is built from the path id; no per-video lookup needed.
3084 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex capture from the page, HTML-unescaped, else default.
3085 def _findProp(rexp, default=None):
3086 m = re.search(rexp, webpage)
3088 return unescapeHTML(m.group(1))
3092 shortened_video_id = video_id.rpartition('/')[2]
3093 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3095 'id': shortened_video_id,
3099 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3100 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): fragmentary listing — guards, the per-clip info dict opener,
# the paging loop header, and several returns are elided (numbering jumps).
3104 class JustinTVIE(InfoExtractor):
3105 """Information extractor for justin.tv and twitch.tv"""
3106 # TODO: One broadcast may be split into multiple videos. The key
3107 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3108 # starts at 1 and increases. Can we treat all parts as one video?
3110 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3112 (?P<channelid>[^/]+)|
3113 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3114 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when walking a channel's archive via the API.
3118 _JUSTIN_PAGE_LIMIT = 100
3119 IE_NAME = u'justin.tv'
3121 def report_download_page(self, channel, offset):
3122 """Report attempt to download a single page of videos."""
3123 self.to_screen(u'%s: Downloading video information from %d to %d' %
3124 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3126 # Return count of items, list of *valid* items
3127 def _parse_page(self, url, video_id):
3128 webpage = self._download_webpage(url, video_id,
3129 u'Downloading video info JSON',
3130 u'unable to download video info JSON')
3132 response = json.loads(webpage)
# A non-list response is the API's error envelope (a dict with 'error').
3133 if type(response) != list:
3134 error_text = response.get('error', 'unknown error')
3135 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3137 for clip in response:
3138 video_url = clip['video_file_url']
3140 video_extension = os.path.splitext(video_url)[1][1:]
# 'start_time' begins with YYYY-MM-DD; strip dashes to get YYYYMMDD.
3141 video_date = re.sub('-', '', clip['start_time'][:10])
3142 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3143 video_id = clip['id']
3144 video_title = clip.get('title', video_id)
3148 'title': video_title,
3149 'uploader': clip.get('channel_name', video_uploader_id),
3150 'uploader_id': video_uploader_id,
3151 'upload_date': video_date,
3152 'ext': video_extension,
# Count of raw items plus the accumulated (elided) info list.
3154 return (len(response), info)
3156 def _real_extract(self, url):
3157 mobj = re.match(self._VALID_URL, url)
3159 raise ExtractorError(u'invalid URL: %s' % url)
3161 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel, single broadcast (/b/), chapter (/c/).
3163 if mobj.group('channelid'):
3165 video_id = mobj.group('channelid')
3166 api = api_base + '/channel/archives/%s.json' % video_id
3167 elif mobj.group('chapterid'):
3168 chapter_id = mobj.group('chapterid')
3170 webpage = self._download_webpage(url, chapter_id)
3171 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3173 raise ExtractorError(u'Cannot find archive of a chapter')
3174 archive_id = m.group(1)
3176 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3177 chapter_info_xml = self._download_webpage(api, chapter_id,
3178 note=u'Downloading chapter information',
3179 errnote=u'Chapter information download failed')
3180 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the page's archive id; the loop
# body (presumably a break) is elided here.
3181 for a in doc.findall('.//archive'):
3182 if archive_id == a.find('./id').text:
3185 raise ExtractorError(u'Could not find chapter in chapter information')
3187 video_url = a.find('./video_file_url').text
3188 video_ext = video_url.rpartition('.')[2] or u'flv'
3190 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3191 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3192 note='Downloading chapter metadata',
3193 errnote='Download of chapter metadata failed')
3194 chapter_info = json.loads(chapter_info_json)
3196 bracket_start = int(doc.find('.//bracket_start').text)
3197 bracket_end = int(doc.find('.//bracket_end').text)
3199 # TODO determine start (and probably fix up file)
3200 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3201 #video_url += u'?start=' + TODO:start_timestamp
3202 # bracket_start is 13290, but we want 51670615
3203 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3204 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3207 'id': u'c' + chapter_id,
3210 'title': chapter_info['title'],
3211 'thumbnail': chapter_info['preview'],
3212 'description': chapter_info['description'],
3213 'uploader': chapter_info['channel']['display_name'],
3214 'uploader_id': chapter_info['channel']['name'],
3218 video_id = mobj.group('videoid')
3219 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3221 self.report_extraction(video_id)
3225 limit = self._JUSTIN_PAGE_LIMIT
# Paged fetch loop (header elided): stop once a short page comes back.
3228 self.report_download_page(video_id, offset)
3229 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3230 page_count, page_info = self._parse_page(page_url, video_id)
3231 info.extend(page_info)
3232 if not paged or page_count != limit:
# Information extractor for funnyordie.com videos.
# NOTE(review): fragmentary listing — the `if m is None:` guards, the
# description fallback branch, and the returned info dict are elided.
3237 class FunnyOrDieIE(InfoExtractor):
3238 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3240 def _real_extract(self, url):
3241 mobj = re.match(self._VALID_URL, url)
3243 raise ExtractorError(u'invalid URL: %s' % url)
3245 video_id = mobj.group('id')
3246 webpage = self._download_webpage(url, video_id)
# The second <source> inside <video> carries the usable media URL.
3248 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3250 raise ExtractorError(u'Unable to find video information')
3251 video_url = unescapeHTML(m.group('url'))
# Title: prefer the player page h1, fall back to the <title> tag.
3253 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3255 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3257 raise ExtractorError(u'Cannot find video title')
3258 title = clean_html(m.group('title'))
3260 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3262 desc = unescapeHTML(m.group('desc'))
3271 'description': desc,
# Information extractor for store.steampowered.com game trailer pages.
# NOTE(review): fragmentary listing — parts of the verbose _VALID_URL
# (including the 'gameID' group used below), the per-video info dict
# opener, and the videos-list accumulation are elided.
3275 class SteamIE(InfoExtractor):
3276 _VALID_URL = r"""http://store\.steampowered\.com/
3278 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3280 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Override needed because _VALID_URL is written with (?x)-style comments
# and must be matched with re.VERBOSE.
3284 def suitable(cls, url):
3285 """Receives a URL and returns True if suitable for this IE."""
3286 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3288 def _real_extract(self, url):
3289 m = re.match(self._VALID_URL, url, re.VERBOSE)
3290 gameID = m.group('gameID')
# Agecheck URL with a fixed fake birthdate bypasses the age gate.
3291 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3292 self.report_age_confirmation()
3293 webpage = self._download_webpage(videourl, gameID)
3294 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3296 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3297 mweb = re.finditer(urlRE, webpage)
3298 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3299 titles = re.finditer(namesRE, webpage)
3300 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3301 thumbs = re.finditer(thumbsRE, webpage)
# Movie entries, titles and thumbnails are three parallel scans zipped
# together; they are assumed to appear in the same page order.
3303 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3304 video_id = vid.group('videoID')
3305 title = vtitle.group('videoName')
3306 video_url = vid.group('videoURL')
3307 video_thumb = thumb.group('thumbnail')
3309 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3314 'title': unescapeHTML(title),
3315 'thumbnail': video_thumb
3318 return [self.playlist_result(videos, gameID, game_title)]
# Information extractor for ustream.tv recorded videos.
# NOTE(review): fragmentary listing — the returned info dict opener and
# its 'id'/'url'/'ext'/'title' entries are elided (numbering jumps 3332->3338).
3320 class UstreamIE(InfoExtractor):
3321 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3322 IE_NAME = u'ustream'
3324 def _real_extract(self, url):
3325 m = re.match(self._VALID_URL, url)
3326 video_id = m.group('videoID')
# Media URL is derived directly from the numeric id on the CDN.
3327 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3328 webpage = self._download_webpage(url, video_id)
3329 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3330 title = m.group('title')
3331 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3332 uploader = m.group('uploader')
3338 'uploader': uploader
# NOTE(review): fragmentary listing — the mp4/flv extension selection,
# the `else` of the url check, several `if ... is None:` guards, and the
# returned results list are elided.
3342 class WorldStarHipHopIE(InfoExtractor):
3343 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3344 IE_NAME = u'WorldStarHipHop'
3346 def _real_extract(self, url):
# Flash player embeds the media URL via so.addVariable("file", ...).
3347 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3349 m = re.match(self._VALID_URL, url)
3350 video_id = m.group('id')
3352 webpage_src = self._download_webpage(url, video_id)
3354 mobj = re.search(_src_url, webpage_src)
3356 if mobj is not None:
3357 video_url = mobj.group(1)
# Extension branch (mp4 vs. other) — body elided in this listing.
3358 if 'mp4' in video_url:
3363 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3365 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3368 raise ExtractorError(u'Cannot determine title')
3369 title = mobj.group(1)
3371 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3372 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3373 if mobj is not None:
3374 thumbnail = mobj.group(1)
# Candy sites lack image_src; re-derive the title from candytitles markup.
3376 _title = r"""candytitles.*>(.*)</span>"""
3377 mobj = re.search(_title, webpage_src)
3378 if mobj is not None:
3379 title = mobj.group(1)
3386 'thumbnail' : thumbnail,
# Information extractor for rbmaradio.com shows.
# NOTE(review): fragmentary listing — the `if m is None:` guard, the `try:`
# opener before json.loads, and the info dict opener/'url'/'ext' lines are
# elided.
3391 class RBMARadioIE(InfoExtractor):
3392 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3394 def _real_extract(self, url):
3395 m = re.match(self._VALID_URL, url)
3396 video_id = m.group('videoID')
3398 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as a JSON blob assigned to gon.show.
3399 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3401 raise ExtractorError(u'Cannot find metadata')
3402 json_data = m.group(1)
3405 data = json.loads(json_data)
3406 except ValueError as e:
3407 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' requests the 256 kbit/s stream from the Akamai URL.
3409 video_url = data['akamai_url'] + '&cbr=256'
3410 url_parts = compat_urllib_parse_urlparse(video_url)
3411 video_ext = url_parts.path.rpartition('.')[2]
3416 'title': data['title'],
3417 'description': data.get('teaser_text'),
3418 'location': data.get('country_of_origin'),
3419 'uploader': data.get('host', {}).get('name'),
3420 'uploader_id': data.get('host', {}).get('slug'),
3421 'thumbnail': data.get('image', {}).get('large_url_2x'),
3422 'duration': data.get('duration'),
# NOTE(review): fragmentary listing — the `for x in formats` loop of
# _specific, several `if result is None:` guards, the per-link loop header,
# the size/bitrate unpacking, and the format-selection returns are elided.
3427 class YouPornIE(InfoExtractor):
3428 """Information extractor for youporn.com."""
3429 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3431 def _print_formats(self, formats):
3432 """Print all available formats"""
3433 print(u'Available formats:')
3434 print(u'ext\t\tformat')
3435 print(u'---------------------------------')
3436 for format in formats:
3437 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the entry whose 'format' equals req_format (loop header elided).
3439 def _specific(self, req_format, formats):
3441 if(x["format"]==req_format):
3445 def _real_extract(self, url):
3446 mobj = re.match(self._VALID_URL, url)
3448 raise ExtractorError(u'Invalid URL: %s' % url)
3450 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the age-confirmation interstitial.
3452 req = compat_urllib_request.Request(url)
3453 req.add_header('Cookie', 'age_verified=1')
3454 webpage = self._download_webpage(req, video_id)
3456 # Get the video title
3457 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3459 raise ExtractorError(u'Unable to extract video title')
3460 video_title = result.group('title').strip()
3462 # Get the video date
3463 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
# Date is optional: warn and continue rather than abort.
3465 self._downloader.report_warning(u'unable to extract video date')
3468 upload_date = unified_strdate(result.group('date').strip())
3470 # Get the video uploader
3471 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3473 self._downloader.report_warning(u'unable to extract uploader')
3474 video_uploader = None
3476 video_uploader = result.group('uploader').strip()
3477 video_uploader = clean_html( video_uploader )
3479 # Get all of the formats available
3480 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3481 result = re.search(DOWNLOAD_LIST_RE, webpage)
3483 raise ExtractorError(u'Unable to extract download list')
3484 download_list_html = result.group('download_list').strip()
3486 # Get all of the links from the page
3487 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3488 links = re.findall(LINK_RE, download_list_html)
3489 if(len(links) == 0):
3490 raise ExtractorError(u'ERROR: no known formats available for video')
3492 self.to_screen(u'Links found: %d' % len(links))
3497 # A link looks like this:
3498 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3499 # A path looks like this:
3500 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Per-link body (loop header elided): derive extension and format
# descriptor from the URL path components.
3501 video_url = unescapeHTML( link )
3502 path = compat_urllib_parse_urlparse( video_url ).path
3503 extension = os.path.splitext( path )[1][1:]
# Path segment 4 looks like '480p_370k_<id>'; first two parts are
# resolution and bitrate.
3504 format = path.split('/')[4].split('_')[:2]
3507 format = "-".join( format )
3508 title = u'%s-%s-%s' % (video_title, size, bitrate)
3513 'uploader': video_uploader,
3514 'upload_date': upload_date,
3519 'description': None,
3523 if self._downloader.params.get('listformats', None):
3524 self._print_formats(formats)
3527 req_format = self._downloader.params.get('format', None)
3528 self.to_screen(u'Format: %s' % req_format)
# Format selection: best (default), worst, all, or a specific format id.
3530 if req_format is None or req_format == 'best':
3532 elif req_format == 'worst':
3533 return [formats[-1]]
3534 elif req_format in ('-1', 'all'):
3537 format = self._specific( req_format, formats )
3539 raise ExtractorError(u'Requested format not available')
# NOTE(review): fragmentary listing — `if result is None:` guards, the
# 'url'/'ext' entries of the info dict, and the final return are elided.
3545 class PornotubeIE(InfoExtractor):
3546 """Information extractor for pornotube.com."""
3547 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3548 def _real_extract(self, url):
3549 mobj = re.match(self._VALID_URL, url)
3551 raise ExtractorError(u'Invalid URL: %s' % url)
3553 video_id = mobj.group('videoid')
# Title is taken straight from the URL slug, not from the page.
3554 video_title = mobj.group('title')
3556 # Get webpage content
3557 webpage = self._download_webpage(url, video_id)
3560 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3561 result = re.search(VIDEO_URL_RE, webpage)
3563 raise ExtractorError(u'Unable to extract video url')
3564 video_url = compat_urllib_parse.unquote(result.group('url'))
3566 #Get the uploaded date
3567 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3568 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says 'title' but this guard is for the date.
3570 raise ExtractorError(u'Unable to extract video title')
3571 upload_date = unified_strdate(result.group('date'))
3573 info = {'id': video_id,
3576 'upload_date': upload_date,
3577 'title': video_title,
# NOTE(review): fragmentary listing — `if result is None:` guards and the
# info dict's 'url'/'ext' lines are elided; comments describe visible code.
3584 class YouJizzIE(InfoExtractor):
3585 """Information extractor for youjizz.com."""
3586 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3587 def _real_extract(self, url):
3588 mobj = re.match(self._VALID_URL, url)
3590 raise ExtractorError(u'Invalid URL: %s' % url)
3592 video_id = mobj.group('videoid')
3594 # Get webpage content
3595 webpage = self._download_webpage(url, video_id)
3597 # Get the video title
3598 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3600 raise ExtractorError(u'ERROR: unable to extract video title')
3601 video_title = result.group('title').strip()
3603 # Get the embed page
# The watch page only links an embed page; the media URL lives there.
3604 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3606 raise ExtractorError(u'ERROR: unable to extract embed page')
3608 embed_page_url = result.group(0).strip()
# video_id is rebound to the numeric embed id from this point on.
3609 video_id = result.group('videoid')
3611 webpage = self._download_webpage(embed_page_url, video_id)
3614 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3616 raise ExtractorError(u'ERROR: unable to extract video url')
3617 video_url = result.group('source')
3619 info = {'id': video_id,
3621 'title': video_title,
3624 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes (playlists of tracks).
# NOTE(review): fragmentary listing — the mix_id assignment, the info-dict
# opener inside the loop, the res.append/return, and the loop exit are
# elided (numbering jumps).
3628 class EightTracksIE(InfoExtractor):
3630 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3632 def _real_extract(self, url):
3633 mobj = re.match(self._VALID_URL, url)
3635 raise ExtractorError(u'Invalid URL: %s' % url)
3636 playlist_id = mobj.group('id')
3638 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JS assignment to PAGE.mix.
3640 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3642 raise ExtractorError(u'Cannot find trax information')
3643 json_like = m.group(1)
3644 data = json.loads(json_like)
# A random session token is required by the play API.
3646 session = str(random.randint(0, 1000000000))
3648 track_count = data['tracks_count']
3649 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3650 next_url = first_url
# Walk the play API one track at a time until at_last_track is set.
3652 for i in itertools.count():
3653 api_json = self._download_webpage(next_url, playlist_id,
3654 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3655 errnote=u'Failed to download song information')
3656 api_data = json.loads(api_json)
3657 track_data = api_data[u'set']['track']
3659 'id': track_data['id'],
3660 'url': track_data['track_file_stream_url'],
3661 'title': track_data['performer'] + u' - ' + track_data['name'],
3662 'raw_title': track_data['name'],
3663 'uploader_id': data['user']['login'],
3667 if api_data['set']['at_last_track']:
3669 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Information extractor for keek.com clips.
# NOTE(review): fragmentary listing — IE_NAME, the info dict opener and its
# 'id'/'url'/'ext'/'title' lines, and the return are elided.
3672 class KeekIE(InfoExtractor):
3673 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3676 def _real_extract(self, url):
3677 m = re.match(self._VALID_URL, url)
3678 video_id = m.group('videoID')
# Media and thumbnail URLs are derived directly from the id on the CDN.
3679 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3680 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3681 webpage = self._download_webpage(url, video_id)
3682 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3683 title = unescapeHTML(m.group('title'))
3684 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3685 uploader = clean_html(m.group('uploader'))
3691 'thumbnail': thumbnail,
3692 'uploader': uploader
# Information extractor for ted.com talks and playlists.
# NOTE(review): fragmentary listing — parts of the verbose _VALID_URL
# (including a 'gameID'-style alternation and closing), the video_RE
# raw-string opener, and the returned talk info dict are elided.
3696 class TEDIE(InfoExtractor):
3697 _VALID_URL=r'''http://www\.ted\.com/
3699 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3701 ((?P<type_talk>talks)) # We have a simple talk
3703 (/lang/(.*?))? # The url may contain the language
3704 /(?P<name>\w+) # Here goes the name and then ".html"
# Override needed because _VALID_URL uses verbose-mode comments.
3708 def suitable(cls, url):
3709 """Receives a URL and returns True if suitable for this IE."""
3710 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3712 def _real_extract(self, url):
3713 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Dispatch: single talk vs. playlist of talks.
3714 if m.group('type_talk'):
3715 return [self._talk_info(url)]
3717 playlist_id=m.group('playlist_id')
3718 name=m.group('name')
3719 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3720 return [self._playlist_videos_info(url,name,playlist_id)]
3722 def _talk_video_link(self,mediaSlug):
3723 '''Returns the video link for that mediaSlug'''
3724 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3726 def _playlist_videos_info(self,url,name,playlist_id=0):
3727 '''Returns the videos of the playlist'''
3729 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3730 ([.\s]*?)data-playlist_item_id="(\d+)"
3731 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3733 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3734 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3735 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3736 m_names=re.finditer(video_name_RE,webpage)
3738 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3739 m_playlist = re.search(playlist_RE, webpage)
3740 playlist_title = m_playlist.group('playlist_title')
3742 playlist_entries = []
# Each playlist entry is delegated back to this extractor via url_result.
3743 for m_video, m_name in zip(m_videos,m_names):
3744 video_id=m_video.group('video_id')
3745 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3746 playlist_entries.append(self.url_result(talk_url, 'TED'))
3747 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3749 def _talk_info(self, url, video_id=0):
3750 """Return the video for the talk in the url"""
3751 m=re.match(self._VALID_URL, url,re.VERBOSE)
3752 videoName=m.group('name')
3753 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3754 # If the url includes the language we get the title translated
3755 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3756 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the mediaSlug used to
# build the download URL.
3757 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3758 "id":(?P<videoID>[\d]+).*?
3759 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3760 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3761 thumb_match=re.search(thumb_RE,webpage)
3762 info_match=re.search(info_RE,webpage,re.VERBOSE)
3763 video_id=info_match.group('videoID')
3764 mediaSlug=info_match.group('mediaSlug')
3765 video_url=self._talk_video_link(mediaSlug)
3771 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de, driven by the site's XML metadata
# endpoint.
# NOTE(review): fragmentary listing — the empty-video_id fallback branch,
# the format/description/thumbnail default branches, and the info dict
# opener are elided.
3775 class MySpassIE(InfoExtractor):
3776 _VALID_URL = r'http://www.myspass.de/.*'
3778 def _real_extract(self, url):
3779 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3781 # video id is the last path element of the URL
3782 # usually there is a trailing slash, so also try the second but last
3783 url_path = compat_urllib_parse_urlparse(url).path
3784 url_parent_path, video_id = os.path.split(url_path)
3786 _, video_id = os.path.split(url_parent_path)
3789 metadata_url = META_DATA_URL_TEMPLATE % video_id
3790 metadata_text = self._download_webpage(metadata_url, video_id)
# Encode back to bytes before XML parsing to avoid unicode issues.
3791 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3793 # extract values from metadata
3794 url_flv_el = metadata.find('url_flv')
3795 if url_flv_el is None:
3796 raise ExtractorError(u'Unable to extract download url')
3797 video_url = url_flv_el.text
3798 extension = os.path.splitext(video_url)[1][1:]
3799 title_el = metadata.find('title')
3800 if title_el is None:
3801 raise ExtractorError(u'Unable to extract title')
3802 title = title_el.text
# Remaining fields are optional: missing elements fall back to defaults
# (fallback branches elided in this listing).
3803 format_id_el = metadata.find('format_id')
3804 if format_id_el is None:
3807 format = format_id_el.text
3808 description_el = metadata.find('description')
3809 if description_el is not None:
3810 description = description_el.text
3813 imagePreview_el = metadata.find('imagePreview')
3814 if imagePreview_el is not None:
3815 thumbnail = imagePreview_el.text
3824 'thumbnail': thumbnail,
3825 'description': description
# Information extractor for spiegel.de videos, using the site's per-video
# XML format listing.
# NOTE(review): fragmentary listing — the `if m is None:` guard and the
# returned info dict opener/'url'/'ext' lines are elided.
3829 class SpiegelIE(InfoExtractor):
3830 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3832 def _real_extract(self, url):
3833 m = re.match(self._VALID_URL, url)
3834 video_id = m.group('videoID')
3836 webpage = self._download_webpage(url, video_id)
3837 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3839 raise ExtractorError(u'Cannot find title')
3840 video_title = unescapeHTML(m.group(1))
3842 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3843 xml_code = self._download_webpage(xml_url, video_id,
3844 note=u'Downloading XML', errnote=u'Failed to download XML')
3846 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> entry in the XML is taken as the preferred variant.
3847 last_type = idoc[-1]
3848 filename = last_type.findall('./filename')[0].text
3849 duration = float(last_type.findall('./duration')[0].text)
3851 video_url = 'http://video2.spiegel.de/flash/' + filename
3852 video_ext = filename.rpartition('.')[2]
3857 'title': video_title,
3858 'duration': duration,
# NOTE(review): fragmentary listing — `if m is None:` guards, the
# description/uploader None fallbacks, and the info dict opener are elided.
3864 class LiveLeakIE(InfoExtractor):
3865 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3866 IE_NAME = u'liveleak'
3867 def _real_extract(self, url):
3868 mobj = re.match(self._VALID_URL, url)
3870 raise ExtractorError(u'Invalid URL: %s' % url)
3872 video_id = mobj.group('video_id')
3874 webpage = self._download_webpage(url, video_id)
# Media URL comes from the player config's `file:` entry.
3876 m = re.search(r'file: "(.*?)",', webpage)
3878 raise ExtractorError(u'Unable to find video url')
3879 video_url = m.group(1)
3881 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3883 raise ExtractorError(u'Cannot find video title')
# Site prefix 'LiveLeak.com -' is stripped from the og:title.
3884 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3886 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3888 desc = unescapeHTML(m.group('desc'))
3892 m = re.search(r'By:.*?(\w+)</a>', webpage)
3894 uploader = clean_html(m.group(1))
3903 'description': desc,
3904 'uploader': uploader
# Information extractor for the ARD Mediathek / daserste.de.
# NOTE(review): fragmentary listing — the numid/else branching lines, the
# `if not streams:` guard before the fsk assert, and the final return are
# elided (numbering jumps).
3909 class ARDIE(InfoExtractor):
3910 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3911 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3912 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3914 def _real_extract(self, url):
3915 # determine video id from url
3916 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId= query parameter over the path segment.
3918 numid = re.search(r'documentId=([0-9]+)', url)
3920 video_id = numid.group(1)
3922 video_id = m.group('video_id')
3924 # determine title and media streams from webpage
3925 html = self._download_webpage(url, video_id)
3926 title = re.search(self._TITLE, html).group('title')
3927 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker means the video is age-restricted until 20:00.
3929 assert '"fsk"' in html
3930 raise ExtractorError(u'This video is only available after 8:00 pm')
3932 # choose default media type and highest quality for now
3933 stream = max([s for s in streams if int(s["media_type"]) == 0],
3934 key=lambda s: int(s["quality"]))
3936 # there's two possibilities: RTMP stream or HTTP download
3937 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3938 if stream['rtmp_url']:
3939 self.to_screen(u'RTMP download detected')
3940 assert stream['video_url'].startswith('mp4:')
3941 info["url"] = stream["rtmp_url"]
3942 info["play_path"] = stream['video_url']
3944 assert stream["video_url"].endswith('.mp4')
3945 info["url"] = stream["video_url"]
# Information extractor for tumblr.com video posts.
# NOTE(review): fragmentary listing — the `if video is None: return`
# sequence and the tail of the returned dict ('url'/'title'/'thumbnail'/
# 'ext') are elided.
3948 class TumblrIE(InfoExtractor):
3949 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3951 def _real_extract(self, url):
3952 m_url = re.match(self._VALID_URL, url)
3953 video_id = m_url.group('id')
3954 blog = m_url.group('blog_name')
# Canonicalize to the /post/ URL regardless of which form matched.
3956 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3957 webpage = self._download_webpage(url, video_id)
# The embedded player markup is JS-escaped, hence the \x22 quote escapes.
3959 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3960 video = re.search(re_video, webpage)
3962 self.to_screen("No video founded")
3964 video_url = video.group('video_url')
3965 ext = video.group('ext')
3967 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
3968 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
3970 # The only place where you can get a title, it's not complete,
3971 # but searching in other places doesn't work for all videos
3972 re_title = r'<title>(?P<title>.*?)</title>'
3973 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
3975 return [{'id': video_id,
# Information extractor for free Bandcamp tracks.
# NOTE(review): fragmentary listing — the 'url'/'ext' entries of the track
# info dict and the final return are elided.
3982 class BandcampIE(InfoExtractor):
3983 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
3985 def _real_extract(self, url):
3986 mobj = re.match(self._VALID_URL, url)
3987 title = mobj.group('title')
3988 webpage = self._download_webpage(url, title)
3989 # We get the link to the free download page
3990 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
# Only tracks offered as free downloads are supported.
3991 if m_download is None:
3992 raise ExtractorError(u'No free songs founded')
3994 download_link = m_download.group(1)
# Track id is scraped from the TralbumData JS blob.
3995 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
3996 webpage, re.MULTILINE|re.DOTALL).group('id')
3998 download_webpage = self._download_webpage(download_link, id,
3999 'Downloading free downloads page')
4000 # We get the dictionary of the track from some javascrip code
4001 info = re.search(r'items: (.*?),$',
4002 download_webpage, re.MULTILINE).group(1)
4003 info = json.loads(info)[0]
4004 # We pick mp3-320 for now, until format selection can be easily implemented.
4005 mp3_info = info[u'downloads'][u'mp3-320']
4006 # If we try to use this url it says the link has expired
4007 initial_url = mp3_info[u'url']
4008 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4009 m_url = re.match(re_url, initial_url)
4010 #We build the url we will use to get the final track url
4011 # This url is build in Bandcamp in the script download_bunde_*.js
4012 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4013 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4014 # If we could correctly generate the .rand field the url would be
4015 #in the "download_url" key
4016 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4018 track_info = {'id':id,
4019 'title' : info[u'title'],
4022 'thumbnail' : info[u'thumb_url'],
4023 'uploader' : info[u'artist']
# NOTE(review): fragmentary listing — `if mobj is None:` guards, the info
# list/dict opener with 'url', and the return are elided.
4029 class RedTubeIE(InfoExtractor):
4030 """Information Extractor for redtube"""
4031 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4032 def _real_extract(self,url):
4033 mobj = re.match(self._VALID_URL, url)
4035 raise ExtractorError(u'Invalid URL: %s' % url)
4037 video_id = mobj.group('id')
4038 video_extension = 'mp4'
4039 webpage = self._download_webpage(url, video_id)
4040 self.report_extraction(video_id)
# Media URL is taken from the HTML5 <source type="video/mp4"> tag.
4041 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4044 raise ExtractorError(u'Unable to extract media URL')
4046 video_url = mobj.group(1)
4047 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4049 raise ExtractorError(u'Unable to extract title')
4050 video_title = mobj.group(1)
4055 'ext': video_extension,
4056 'title': video_title,
# NOTE(review): fragmentary listing — `if mobj is None:` guards, the info
# list/dict opener with 'url', and the return are elided.
4060 class InaIE(InfoExtractor):
4061 """Information Extractor for Ina.fr"""
4062 _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4063 def _real_extract(self,url):
4064 mobj = re.match(self._VALID_URL, url)
4066 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page.
4067 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4068 video_extension = 'mp4'
4069 webpage = self._download_webpage(mrss_url, video_id)
4071 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4073 raise ExtractorError(u'Unable to extract media URL')
4074 video_url = mobj.group(1)
4076 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4078 raise ExtractorError(u'Unable to extract title')
4079 video_title = mobj.group(1)
4084 'ext': video_extension,
4085 'title': video_title,
# NOTE(review): fragmentary listing — the returned list is almost entirely
# elided here (numbering jumps 4093 -> 4118 -> 4128 -> 4148); only a few of
# the instantiated extractors are visible.
4088 def gen_extractors():
4089 """ Return a list of an instance of every supported extractor.
4090 The order does matter; the first extractor matched is the one handling the URL.
4093 YoutubePlaylistIE(),
4118 StanfordOpenClassroomIE(),
4128 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the
    # class object can be looked up directly in this module's namespace.
    class_name = ie_name + 'IE'
    return globals()[class_name]