2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# Base class for all site-specific information extractors below.
# NOTE(review): this listing is excerpted — the class docstring below is missing
# several lines (and its closing quotes are not visible here).
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
# NOTE(review): excerpt gap — initialization line(s) between the docstring and
# the set_downloader() call are not visible here.
# The downloader may be None; it can be attached later via set_downloader().
75 self.set_downloader(downloader)
# NOTE(review): takes `cls` — presumably decorated with @classmethod in the
# full file; the decorator line is not visible in this excerpt.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): excerpt gap — a call between the docstring and the return
# (presumably self.initialize()) is not visible here.
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the FileDownloader instance this extractor reports through."""
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
# NOTE(review): the method body (likely a bare `pass`) is not visible in this
# excerpt.
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# NOTE(review): the line below belongs to a different member (an IE_NAME
# property, whose `def` line is not visible here) — it derives the extractor
# name from the class name by dropping the trailing "IE".
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
# NOTE(review): excerpt gap — the branch opening above this line (presumably
# `if note is None:`) is not visible here.
117 self.report_download_webpage(video_id)
118 elif note is not False:
# A custom progress note is printed as "<video_id>: <note>".
119 self.to_screen(u'%s: %s' % (video_id, note))
# NOTE(review): the `try:` matching the `except` below is not visible here.
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): the `if errnote is None:` guard above this default is not
# visible in this excerpt.
124 errnote = u'Unable to download webpage'
# Re-raise any network-level failure as an ExtractorError, keeping the traceback.
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Try to pick the charset out of the Content-Type response header.
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
# NOTE(review): the `if m:` / fallback-encoding branch around the line above is
# not visible in this excerpt.
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
# NOTE(review): the `try:` and the plain-string fallback for `url` are not
# visible in this excerpt.
142 self.to_screen(u'Dumping request to ' + url)
# Dump the raw page bytes as base64 so binary responses survive the terminal.
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps going on decode errors instead of raising.
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Fetch a webpage and return its decoded content as a string."""
    content, _urlh = self._download_webpage_handle(url_or_request, video_id, note, errnote)
    return content
def to_screen(self, msg):
    """Write msg to the screen, prefixed with this extractor's name in brackets."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started for a video or name."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for video_id is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'Confirming age'
    self.to_screen(message)
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
172 video_info['_type'] = 'video'
# NOTE(review): the trailing `return video_info` is not visible in this excerpt.
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
# NOTE(review): the dict literal below is truncated in this excerpt — its
# 'url'/'ie' entries and the return are not visible.
177 video_info = {'_type': 'url',
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
# NOTE(review): the dict literal, the `if` guards around the optional id/title
# assignments, and the return are truncated in this excerpt.
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
# YouTube video extractor (excerpt — parts of the _VALID_URL pattern string and
# of the format tables below are omitted from this listing, so several literals
# appear unclosed here).
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
234 _video_dimensions = {
# NOTE(review): takes `cls` — presumably decorated with @classmethod in the
# full file; the decorator line is not visible in this excerpt.
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs also match _VALID_URL, so defer them to YoutubePlaylistIE.
255 if YoutubePlaylistIE.suitable(url): return False
256 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Announce the attempt to force the site language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Announce the attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce that the video's watch page is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report that the list of available subtitles is being checked.

    (Docstring fixed: it was copy-pasted from the video-info method and
    wrongly said "download video info webpage".)
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report the download of the subtitle track sub_lang in the given format.

    (Docstring fixed: it was copy-pasted from the video-info method and
    wrongly said "download video info webpage".)
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Print the subtitle languages available for the given video."""
    langs = ",".join(list(sub_lang_list.keys()))
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Announce that video information extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video.

    (Docstring fixed: it wrongly said "Report extracted video URL.",
    a copy-paste error — the message printed is about an unavailable format.)
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# NOTE(review): the `try:` matching the `except` below is not visible here.
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# On network failure, return an (error_message, None) tuple instead of raising.
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Parse the name/lang_code attribute pairs into a {lang_code: name} dict.
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
# NOTE(review): the success-path return (presumably the language dict) is not
# visible in this excerpt.
def _list_available_subtitles(self, video_id):
    """Look up which subtitle languages exist for video_id and print them."""
    available = self._get_available_subtitles(video_id)
    self.report_video_subtitles_available(video_id, available)
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# NOTE(review): the docstring around the line below is truncated in this
# excerpt; the method returns a (error_message, sub_lang, sub) tuple.
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
# NOTE(review): the urlencode dict entries between the two lines below are not
# visible in this excerpt.
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
# NOTE(review): the `try:` matching the `except` below is not visible here.
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# NOTE(review): the emptiness check guarding the return below (`if not sub:`)
# is not visible in this excerpt.
334 return (u'Did not fetch video subtitles', None, None)
# Success: (no error, language code, subtitle contents).
335 return (None, sub_lang, sub)
337 def _extract_subtitle(self, video_id):
# NOTE(review): docstring opening/closing quotes are not visible in this
# excerpt; the method returns a single-element list of tuples.
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
# Language preference: user-requested, then English, then the first available.
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
# NOTE(review): the `sub_lang = 'en'` branch and the trailing `else:` are not
# visible in this excerpt.
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# NOTE(review): the trailing `return [subtitle]` is not visible in this excerpt.
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
# NOTE(review): the `subtitles = []` initialization and the trailing
# `return subtitles` are not visible in this excerpt.
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
369 def _print_formats(self, formats):
370 print('Available formats:')
# NOTE(review): the `for x in formats:` loop header above the line below is
# not visible in this excerpt.
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
374 def _real_initialize(self):
# Sets the site language, logs in (explicit credentials or .netrc), then
# confirms age. NOTE(review): this excerpt omits many guard/`try:` lines.
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are a warning, not fatal — extraction can proceed anonymously.
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Force the interface language to English so the scraping regexps match.
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
# Fetch the login page to obtain the GALX/dsh form tokens.
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# NOTE(review): most of the login form dict (login_form_strs) is omitted from
# this excerpt; only some of its entries are visible below.
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, authentication failed.
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm age to access age-restricted videos (form dict partly omitted here).
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the `if mobj is None:` guard above this raise is not visible
# in this excerpt.
480 raise ExtractorError(u'Invalid URL: %s' % url)
# Group 2 of _VALID_URL captures the video ID ([0-9A-Za-z_-]+).
481 video_id = mobj.group(2)
# NOTE(review): the trailing `return video_id` is not visible in this excerpt.
484 def _real_extract(self, url):
# Extracts the info dict(s) for a YouTube watch URL.
# NOTE(review): this excerpt omits many guard/`try:`/`else:` lines throughout.
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-escaped slashes in the SWF URL.
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Query get_video_info, trying several 'el' parameters until a token appears.
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date is scraped from the watch page and normalized to YYYYMMDD.
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
# Fall back to the meta description tag, then to an empty string.
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Build an itag -> direct URL map from the comma-separated stream map.
614 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
615 url_data = compat_parse_qs(url_data_str)
616 if 'itag' in url_data and 'url' in url_data:
617 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
618 if not 'ratebypass' in url: url += '&ratebypass=yes'
619 url_map[url_data['itag'][0]] = url
621 format_limit = self._downloader.params.get('format_limit', None)
622 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
623 if format_limit is not None and format_limit in available_formats:
624 format_list = available_formats[available_formats.index(format_limit):]
626 format_list = available_formats
627 existing_formats = [x for x in format_list if x in url_map]
628 if len(existing_formats) == 0:
629 raise ExtractorError(u'no known formats available for video')
630 if self._downloader.params.get('listformats', None):
631 self._print_formats(existing_formats)
633 if req_format is None or req_format == 'best':
634 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
635 elif req_format == 'worst':
636 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
637 elif req_format in ('-1', 'all'):
638 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
640 # Specific formats. We pick the first in a slash-delimeted sequence.
641 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
642 req_formats = req_format.split('/')
643 video_url_list = None
644 for rf in req_formats:
646 video_url_list = [(rf, url_map[rf])]
648 if video_url_list is None:
649 raise ExtractorError(u'requested format not available')
651 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected (format, url) pair.
# NOTE(review): the results-list initialization, the dict opening, and the
# append/return at the end are not visible in this excerpt.
654 for format_param, video_real_url in video_url_list:
656 video_extension = self._video_extensions.get(format_param, 'flv')
658 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
659 self._video_dimensions.get(format_param, '???'))
663 'url': video_real_url,
664 'uploader': video_uploader,
665 'uploader_id': video_uploader_id,
666 'upload_date': upload_date,
667 'title': video_title,
668 'ext': video_extension,
669 'format': video_format,
670 'thumbnail': video_thumbnail,
671 'description': video_description,
672 'player_url': player_url,
673 'subtitles': video_subtitles,
674 'duration': video_duration
# Metacafe extractor (excerpt).
679 class MetacafeIE(InfoExtractor):
680 """Information Extractor for metacafe.com."""
682 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
683 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
684 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
685 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
# NOTE(review): the `try:` lines matching the two `except` clauses below are
# not visible in this excerpt.
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm the family-filter disclaimer so filtered videos are reachable.
# NOTE(review): the disclaimer_form dict opening is not visible here.
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
712 def _real_extract(self, url):
# NOTE(review): this excerpt omits several `if mobj is None:` guards — the
# bare raises below belong to those guards.
713 # Extract id and simplified title from URL
714 mobj = re.match(self._VALID_URL, url)
716 raise ExtractorError(u'Invalid URL: %s' % url)
718 video_id = mobj.group(1)
720 # Check if video comes from YouTube
721 mobj2 = re.match(r'^yt-(.*)$', video_id)
722 if mobj2 is not None:
# Delegate "yt-" prefixed IDs to the YouTube extractor via a url result.
723 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
725 # Retrieve video webpage to extract further information
726 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
728 # Extract URL, uploader and title from webpage
729 self.report_extraction(video_id)
730 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
732 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
733 video_extension = mediaURL[-3:]
735 # Extract gdaKey if available
736 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
740 gdaKey = mobj.group(1)
741 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaURL/key.
743 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
745 raise ExtractorError(u'Unable to extract media URL')
746 vardict = compat_parse_qs(mobj.group(1))
747 if 'mediaData' not in vardict:
748 raise ExtractorError(u'Unable to extract media URL')
749 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
751 raise ExtractorError(u'Unable to extract media URL')
752 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
753 video_extension = mediaURL[-3:]
754 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
756 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
758 raise ExtractorError(u'Unable to extract title')
759 video_title = mobj.group(1).decode('utf-8')
761 mobj = re.search(r'submitter=(.*?);', webpage)
763 raise ExtractorError(u'Unable to extract uploader nickname')
764 video_uploader = mobj.group(1)
# NOTE(review): the returned list/dict opening and closing are not visible in
# this excerpt; only the dict entries below remain.
767 'id': video_id.decode('utf-8'),
768 'url': video_url.decode('utf-8'),
769 'uploader': video_uploader.decode('utf-8'),
771 'title': video_title,
772 'ext': video_extension.decode('utf-8'),
# Dailymotion extractor (excerpt).
775 class DailymotionIE(InfoExtractor):
776 """Information Extractor for Dailymotion"""
778 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
779 IE_NAME = u'dailymotion'
781 def _real_extract(self, url):
# NOTE(review): this excerpt omits several guard lines (`if mobj is None:`
# etc.); the bare raises below belong to those guards.
782 # Extract id and simplified title from URL
783 mobj = re.match(self._VALID_URL, url)
785 raise ExtractorError(u'Invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
# Disable the family filter so restricted videos are returned.
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 raise ExtractorError(u'Unable to extract media URL')
801 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the best available quality key, from 1080p down to the plain video URL.
803 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
806 self.to_screen(u'Using %s' % key)
809 raise ExtractorError(u'Unable to extract video URL')
811 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
813 raise ExtractorError(u'Unable to extract video URL')
815 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
817 # TODO: support choosing qualities
819 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
821 raise ExtractorError(u'Unable to extract title')
822 video_title = unescapeHTML(mobj.group('title'))
824 video_uploader = None
825 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
827 # Fall back to looking for an official user account.
828 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
829 if mobj_official is None:
830 self._downloader.report_warning(u'unable to extract uploader nickname')
832 video_uploader = mobj_official.group(1)
834 video_uploader = mobj.group(1)
836 video_upload_date = None
837 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Reassemble DD-MM-YYYY into the YYYYMMDD format used by upload_date.
839 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# NOTE(review): the returned list/dict opening and closing are not visible in
# this excerpt; only the dict entries below remain.
844 'uploader': video_uploader,
845 'upload_date': video_upload_date,
846 'title': video_title,
847 'ext': video_extension,
# Photobucket extractor (excerpt).
851 class PhotobucketIE(InfoExtractor):
852 """Information extractor for photobucket.com."""
854 # TODO: the original _VALID_URL was:
855 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
856 # Check if it's necessary to keep the old extraction process
857 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
858 IE_NAME = u'photobucket'
860 def _real_extract(self, url):
# NOTE(review): this excerpt omits several guard and literal-delimiter lines;
# the bare raises below belong to omitted `if mobj is None:` guards.
861 # Extract id from URL
862 mobj = re.match(self._VALID_URL, url)
864 raise ExtractorError(u'Invalid URL: %s' % url)
866 video_id = mobj.group('id')
868 video_extension = mobj.group('ext')
870 # Retrieve video webpage to extract further information
871 webpage = self._download_webpage(url, video_id)
873 # Extract URL, uploader, and title from webpage
874 self.report_extraction(video_id)
875 # We try first by looking the javascript code:
876 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
878 info = json.loads(mobj.group('json'))
# NOTE(review): the returned dict opening/closing around the entries below is
# not visible in this excerpt.
881 'url': info[u'downloadUrl'],
882 'uploader': info[u'username'],
883 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
884 'title': info[u'title'],
885 'ext': video_extension,
886 'thumbnail': info[u'thumbUrl'],
889 # We try looking in other parts of the webpage
890 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
892 raise ExtractorError(u'Unable to extract media URL')
893 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
897 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
899 raise ExtractorError(u'Unable to extract title')
900 video_title = mobj.group(1).decode('utf-8')
902 video_uploader = mobj.group(2).decode('utf-8')
# NOTE(review): the fallback return's list/dict delimiters are also omitted.
905 'id': video_id.decode('utf-8'),
906 'url': video_url.decode('utf-8'),
907 'uploader': video_uploader,
909 'title': video_title,
910 'ext': video_extension.decode('utf-8'),
# Yahoo Screen extractor (excerpt).
914 class YahooIE(InfoExtractor):
915 """Information extractor for screen.yahoo.com."""
916 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
918 def _real_extract(self, url):
# NOTE(review): this excerpt omits several guard lines and literal delimiters;
# the bare raises below belong to omitted `if ... is None:` guards.
919 mobj = re.match(self._VALID_URL, url)
921 raise ExtractorError(u'Invalid URL: %s' % url)
922 video_id = mobj.group('id')
923 webpage = self._download_webpage(url, video_id)
# Look for an alternative content ID embedded in the page's JS.
924 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
927 # TODO: Check which url parameters are required
928 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
929 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
930 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
931 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
932 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
933 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
935 self.report_extraction(video_id)
936 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
938 raise ExtractorError(u'Unable to extract video info')
939 video_title = m_info.group('title')
940 video_description = m_info.group('description')
941 video_thumb = m_info.group('thumb')
942 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD upload_date convention.
943 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
945 # TODO: Find a way to get mp4 videos
946 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
947 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
948 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
949 video_url = m_rest.group('url')
950 video_path = m_rest.group('path')
952 raise ExtractorError(u'Unable to extract video url')
954 else: # We have to use a different method if another id is defined
955 long_id = m_id.group('new_id')
956 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
957 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON payload.
958 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
959 info = json.loads(json_str)
960 res = info[u'query'][u'results'][u'mediaObj'][0]
961 stream = res[u'streams'][0]
962 video_path = stream[u'path']
963 video_url = stream[u'host']
# NOTE(review): the assignment of `meta` (presumably from `res`) is not
# visible in this excerpt.
965 video_title = meta[u'title']
966 video_description = meta[u'description']
967 video_thumb = meta[u'thumbnail']
968 video_date = None # I can't find it
# NOTE(review): the returned info dict's opening/closing and remaining entries
# are not visible in this excerpt.
973 'play_path': video_path,
975 'description': video_description,
976 'thumbnail': video_thumb,
977 'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, pulls the embedded `config` JSON out of the
    page source, and reads title/uploader/thumbnail/date plus the
    signature+timestamp pair needed to build the play_redirect URL.
    """
    # NOTE(review): gaps in the original line numbering show that some lines
    # (guard clauses, try/except headers, the final `return` scaffolding)
    # are elided from this listing; they are flagged inline below.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`; verify against the
        # full source)
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize the URL: force https, and map the player redirect form
        # back to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # (elided: originally wrapped in a try/except — line numbers jump)
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # Distinguish an embed restriction from a generally unparsable page,
        # so the user gets an actionable hint.
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD, per the upload_date convention
        # documented on InfoExtractor)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # (elided `else:` branch header — line numbers jump) fall back
                # to the first quality listed for this codec
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first populated bucket, preferring hd > sd > other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                # (elided: presumably a `break` plus an `else:` before the
                # raise — TODO confirm against the full source)
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # (elided `return [{ 'id': ..., 'url': ... }]` header — the remaining
        # info-dict entries follow the InfoExtractor field conventions)
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the live-stream pages (URL ending in index-NN.html) and
    regular "Plus 7" catch-up videos, by chaining regex scrapes across
    several intermediate pages.
    """
    # NOTE(review): gaps in the original line numbering show elided lines
    # (try headers, call arguments, returns); flagged inline below.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages are recognized by their index-NN.html basename.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download `url` and return the raw page body, mapping network
        errors to ExtractorError."""
        request = compat_urllib_request.Request(url)
        # (elided `try:` header — line numbers jump)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # (elided: `return webpage`)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch `url`, apply `regex` with `regexFlags`, and return a dict
        built from `matchTuples` — (group index, key, error message) triples.
        Raises ExtractorError with the triple's message when a group is
        missing."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # (elided: `info = {}` initialization — line numbers jump)
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # (elided `else:` header)
            info[key] = mobj.group(i)
        # (elided: `return info`)

    def extractLiveStream(self, url):
        # Language code is a fixed path component of live URLs.
        video_lang = url.split('/')[-4]
        # Step 1: locate the videothek JS file referenced by the page.
        info = self.grep_webpage(
            # (elided leading call arguments — line numbers jump)
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: from the JS, pull the rtmp path, player SWF and stream url.
        info = self.grep_webpage(
            # (elided leading call arguments — line numbers jump)
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # (elided: remainder of the method, including its return)

    def extractPlus7Stream(self, url):
        # Language code sits at a different path depth for Plus 7 URLs.
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds a videorefFileUrl pointing at an XML ref.
        info = self.grep_webpage(
            # (elided leading call arguments — line numbers jump)
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the <video> ref for the requested language.
        info = self.grep_webpage(
            # (elided leading call arguments)
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: final XML carries id, title, date and the HD url.
        info = self.grep_webpage(
            # (elided leading call arguments)
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        )

        # (elided `return {` header — info-dict entries follow)
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Dispatch on URL shape: live pages vs Plus 7 catch-up videos.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # (elided: presumably a `return` here — TODO confirm)
        # (elided `else:` header)
        info = self.extractPlus7Stream(url)
        # (elided: `return [info]`)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Used when no site-specific IE matched: first follows URL-shortener
    redirects (via HEAD requests), then scrapes the page for common
    embedded-player patterns (JW Player / SWFObject style `file=` URLs).
    """
    # NOTE(review): line-number gaps show elided lines throughout this
    # listing; they are flagged inline below.

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn outside of test runs, since the generic IE is a heuristic
        # fallback and may produce wrong results.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD so only headers are fetched.
            def get_method(self):
                # (elided body — presumably `return "HEAD"`; TODO confirm)

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # (elided keyword arguments inside this call — line
                    # numbers jump)
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # (elided `else:` header)
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # (elided lines at the top of this handler — line numbers jump)
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # (elided keyword arguments inside this call)
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener chain with HEAD-aware redirect handling
        # and a GET fallback for servers rejecting HEAD.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # (elided: comparison of new_url against url / early return)
        self.report_following_redirect(new_url)
        # (elided: `return new_url`)

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # (elided `try:` header — line numbers jump)
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # (elided `if mobj is None:` guards between the fallback searches —
        # line numbers jump)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # (elided `return [{ 'id': ..., 'url': ... }]` header — info-dict
        # entries follow)
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts pseudo-URLs of the form ytsearch[N|all]:<terms> and queries
    the GData API, 50 results per page, up to _max_youtube_results.
    """
    # NOTE(review): line-number gaps show elided lines (guards, try
    # headers, loop setup); flagged inline below.

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split the "ytsearchN" prefix from the search terms.
        prefix, query = query.split(':')
        # (elided lines — line numbers jump)
        query = query.encode('utf-8')
        # (elided: the `if prefix == '':` branch header)
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
        # (elided `else:`/`try:` headers plus `n = int(prefix)` and the
        # `if n <= 0:` guard — line numbers jump)
        raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp to the API's practical maximum rather than failing.
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # (elided initialization of video_ids/pagenum/limit — line numbers
        # jump)
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # GData start-index is 1-based, hence the +1.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # (elided `try:` header)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the loop at whatever the API reports as the total.
            limit = min(n, api_response['totalItems'])
            # (elided: pagenum increment — line numbers jump)

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        # (elided: `return videos`)
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts gvsearch[N|all]:<terms> pseudo-URLs and scrapes the result
    pages, returning a playlist of matched result URLs.
    """
    # NOTE(review): line-number gaps show elided lines (guards, result-dict
    # setup); flagged inline below.

    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    # Presence of the "next page" link tells us to keep paginating.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # (elided: the `if prefix == '':` branch header — line numbers jump)
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        # (elided `else:` header, `n = int(prefix)` and the `if n <= 0:`
        # guard — line numbers jump)
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp instead of failing when the user over-asks.
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # (elided `res = {` header — playlist result-dict entries follow)
        '_type': 'playlist',
        # (elided remaining dict entries, e.g. id/entries — line numbers
        # jump)

        for pagenum in itertools.count(1):
            # start is 0-based and advances 10 results per page.
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # (elided `e = {` header — entry-dict line follows)
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once we have enough results or there is no next page.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # (elided: `return res`)
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts yvsearch[N|all]:<terms> pseudo-URLs; unlike the other search
    IEs, it feeds matched watch URLs straight to the downloader instead of
    returning results.
    """
    # NOTE(review): line-number gaps show elided lines (guards, try
    # headers, loop setup); flagged inline below.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (elided lines — line numbers jump)
        query = query.encode('utf-8')
        # (elided: the `if prefix == '':` branch header)
        self._download_n_results(query, 1)
        # (elided: `return` — line numbers jump)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # (elided `else:`/`try:` headers plus `n = int(prefix)` and the
        # `if n <= 0:` guard — line numbers jump)
        raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp to the service maximum rather than failing.
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        # (elided: `return`)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            # (elided: `return`)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (elided: video_ids/pagenum initialization — line numbers jump)
        already_seen = set()

        # (elided loop header — presumably `while True:`; TODO confirm)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # (elided `try:` header)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            # De-duplicate: result pages can repeat the same watch link.
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # (elided: `return`)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No next page: flush whatever was collected and stop.
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # (elided: `return`)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Matches playlist/album/course/user-upload URLs, pages through the
    GData playlist feed, and returns the entries ordered by playlist
    position.
    """
    # NOTE(review): line-number gaps show elided lines, including interior
    # lines of the verbose _VALID_URL pattern and its closing quotes, loop
    # setup, and several guards; flagged inline below.

    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    # (elided: presumably a @classmethod decorator here — TODO confirm)
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE whitespace, so the flag is
        # required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # (elided: videos list / page loop header — line numbers jump)
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # (elided `try:` header)
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # (elided: `break` — line numbers jump)

        # Collect (position, watch-url) pairs; skip entries without content
        # (e.g. deleted/private videos).
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # Short page means we reached the end of the playlist.
            # (elided: `break`)

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Scrapes the channel's list view; the first page comes as HTML, later
    pages via the JSON channel_ajax endpoint.
    """
    # NOTE(review): line-number gaps show elided lines (loop setup,
    # break statements); flagged inline below.

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker in the page HTML indicating more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in `page`, first-seen order, de-duplicated."""
        # (elided: `ids_in_page = []` initialization — line numbers jump)
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # (elided: `return ids_in_page`)

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # (elided: video_ids/pagenum initialization — line numbers jump)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # (elided loop header — presumably `while True:`; TODO confirm)
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            # Ajax responses are JSON with the HTML embedded in
            # 'content_html'.
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # (elided: `break`)

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads feed via the GData API,
    _GDATA_PAGE_SIZE ids at a time, and returns a playlist of watch URLs.
    """
    # NOTE(review): line-number gaps show elided lines (loop setup, breaks);
    # flagged inline below.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (elided remainder of this comment and the video_ids/pagenum
        # initialization plus loop header — line numbers jump)

        # GData start-index is 1-based.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # (elided: `ids_in_page = []` initialization — line numbers jump)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # (elided remainder of this comment — line numbers jump)

        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # (elided: `break` and pagenum increment — line numbers jump)

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the user page, then pages through
    the mobile episode-list endpoint to collect video URLs.
    """
    # NOTE(review): line-number gaps show elided lines (loop setup, breaks,
    # _PAGE_SIZE constant); flagged inline below.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric user id is embedded in the page markup; the listing
        # endpoint needs it rather than the username.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # (elided remainder of this comment and the video_ids/pagenum
        # initialization plus loop header — line numbers jump)

        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # (elided: `ids_in_page = []` initialization — line numbers jump)
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # (elided remainder of this comment — line numbers jump)

        if len(ids_in_page) < self._PAGE_SIZE:
            # (elided: `break` and pagenum increment — line numbers jump)

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Rebuilds the URL in the English locale, simulates pressing the
    "Free download" button, then scrapes the real file URL and title.
    """
    # NOTE(review): line-number gaps show elided lines (try headers, guards,
    # the final return scaffolding); flagged inline below.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # POSTing gateway_result=1 mimics the free-download form submit.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # (elided `try:` header — line numbers jump)
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the multi-line site message into one line.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # (elided `else:` header)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # (elided `return [{` header — info-dict entries follow)
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in during initialization (credentials from the
    downloader params or ~/.netrc), then extracts the video URL from the
    SWF parameter blob embedded in the video page.
    """
    # NOTE(review): line-number gaps show elided lines (returns, guards,
    # the login_form construction, try headers); flagged inline below.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Attempt to log in before extraction; failures only warn."""
        if self._downloader is None:
            # (elided: `return` — line numbers jump)

        # (elided: useremail/password initialization — line numbers jump)
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (elided `try:` header)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # (elided: unpacking of login/password from `info` — line
                # numbers jump)
            # (elided `else:` header)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                # (elided: `return`)

        # No credentials available: skip login silently.
        if useremail is None:
            # (elided: `return` and the login_form dict construction —
            # line numbers jump)

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # (elided `try:` header)
        self.report_login()
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form in the response means authentication failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # (elided: `return`)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            # (elided: `return`)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between these two literal JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # (elided guard — presumably `if m is None:`)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        # (elided guard — presumably `if not video_url:`)
        video_url = video_data['sd_src']
        # (elided guard — presumably `if not video_url:`)
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # (elided guard — presumably `if m is None:`)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # (elided `info = {` / `return` scaffolding — info-dict entries
        # follow)
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Resolves /play/ short links via their redirect, then queries the
    site's JSON API (masquerading as iTunes); also handles the case where
    the URL is already a direct media download.
    """
    # NOTE(review): line-number gaps show elided lines (cchar computation,
    # try headers, info-dict scaffolding); flagged inline below.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard — presumably `if mobj is None:`)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Short /play/ links redirect to a page whose fragment names the
        # real file id; re-enter extraction with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # (elided: selection of `cchar` ('?' or '&') — line numbers jump)
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The API serves different data depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # (elided: `info = None` initialization and `try:` header —
        # line numbers jump)
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # (elided `info = {` header — info-dict entries follow)
            'upload_date': None,
            # (elided remaining entries of this dict)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # (elided `try:` header)
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            # (elided `try:` header)
            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' envelope.
            if 'Post' in json_data:
                data = json_data['Post']
            # (elided `else:` branch — line numbers jump)

            # datestamp format observed in the API: MM-DD-YY HH:MM(am/pm).
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # (elided guard — presumably `if umobj is None:`)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # (elided `info = {` header — info-dict entries follow)
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
        # (elided: `return [info]`)
2022 class MyVideoIE(InfoExtractor):
# Extractor for myvideo.de watch pages. Scrapes the page HTML: the media
# URL is derived from the 'image_src' thumbnail link, the title from <title>.
2023 """Information Extractor for myvideo.de."""
2025 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2026 IE_NAME = u'myvideo'
2028 def _real_extract(self,url):
2029 mobj = re.match(self._VALID_URL, url)
2031 raise ExtractorError(u'Invalid URL: %s' % url)
2033 video_id = mobj.group(1)
# Re-fetch via the canonical watch URL (drops the slug/query from the input).
2036 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2037 webpage = self._download_webpage(webpage_url, video_id)
2039 self.report_extraction(video_id)
# The thumbnail's base path doubles as the movie directory for the .flv file.
2040 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2043 raise ExtractorError(u'Unable to extract media URL')
2044 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2046 mobj = re.search('<title>([^<]+)</title>', webpage)
2048 raise ExtractorError(u'Unable to extract title')
2050 video_title = mobj.group(1)
2056 'upload_date': None,
2057 'title': video_title,
2061 class ComedyCentralIE(InfoExtractor):
# Extractor for The Daily Show / The Colbert Report. Accepts ':tds'-style
# shortcuts, full-episode URLs, and clip URLs; resolves content through
# the mtvnservices MRSS feed and a per-media configuration XML, then
# rewrites the chosen RTMP rendition URL to an HTTP mirror.
2062 """Information extractor for The Daily Show and Colbert Report """
2064 # urls can be abbreviations like :thedailyshow or :colbert
2065 # urls for episodes like:
2066 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2067 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2068 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# NOTE: written with re.VERBOSE, so whitespace in the pattern is ignored.
2069 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2070 |(https?://)?(www\.)?
2071 (?P<showname>thedailyshow|colbertnation)\.com/
2072 (full-episodes/(?P<episode>.*)|
2074 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2075 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest preference last-to-first (highest bitrate wins below).
2078 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2080 _video_extensions = {
2088 _video_dimensions = {
# Overridden because _VALID_URL must be matched with re.VERBOSE.
2098 def suitable(cls, url):
2099 """Receives a URL and returns True if suitable for this IE."""
2100 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2102 def _print_formats(self, formats):
2103 print('Available formats:')
2105 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2108 def _real_extract(self, url):
2109 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2111 raise ExtractorError(u'Invalid URL: %s' % url)
# Expand ':tds'/':colbert' style shortcuts into the shows' full-episodes URL.
2113 if mobj.group('shortname'):
2114 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2115 url = u'http://www.thedailyshow.com/full-episodes/'
2117 url = u'http://www.colbertnation.com/full-episodes/'
2118 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2119 assert mobj is not None
2121 if mobj.group('clip'):
2122 if mobj.group('showname') == 'thedailyshow':
2123 epTitle = mobj.group('tdstitle')
2125 epTitle = mobj.group('cntitle')
2128 dlNewest = not mobj.group('episode')
2130 epTitle = mobj.group('showname')
2132 epTitle = mobj.group('episode')
2134 self.report_extraction(epTitle)
# The page may redirect (e.g. to the newest episode); re-match the final URL.
2135 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2137 url = htmlHandle.geturl()
2138 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2140 raise ExtractorError(u'Invalid redirected URL: ' + url)
2141 if mobj.group('episode') == '':
2142 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2143 epTitle = mobj.group('episode')
2145 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2147 if len(mMovieParams) == 0:
2148 # The Colbert Report embeds the information in a without
2149 # a URL prefix; so extract the alternate reference
2150 # and then add the URL prefix manually.
2152 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2153 if len(altMovieParams) == 0:
2154 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2156 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# The MRSS index lists one <item> per episode part.
2158 uri = mMovieParams[0][1]
2159 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2160 indexXml = self._download_webpage(indexUrl, epTitle,
2161 u'Downloading show index',
2162 u'unable to download episode index')
2166 idoc = xml.etree.ElementTree.fromstring(indexXml)
2167 itemEls = idoc.findall('.//item')
2168 for partNum,itemEl in enumerate(itemEls):
# guid format is e.g. '...:<show>.com:<mediaId>' — split on ':' to decompose.
2169 mediaId = itemEl.findall('./guid')[0].text
2170 shortMediaId = mediaId.split(':')[-1]
2171 showId = mediaId.split(':')[-2].replace('.com', '')
2172 officialTitle = itemEl.findall('./title')[0].text
2173 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2175 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2176 compat_urllib_parse.urlencode({'uri': mediaId}))
2177 configXml = self._download_webpage(configUrl, epTitle,
2178 u'Downloading configuration for %s' % shortMediaId)
2180 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs from each <rendition>.
2182 for rendition in cdoc.findall('.//rendition'):
2183 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2187 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2190 if self._downloader.params.get('listformats', None):
2191 self._print_formats([i[0] for i in turls])
2194 # For now, just pick the highest bitrate
2195 format,rtmp_video_url = turls[-1]
2197 # Get the format arg from the arg stream
2198 req_format = self._downloader.params.get('format', None)
2200 # Select format if we can find one
2203 format, rtmp_video_url = f, v
# Rewrite the RTMP URL's media path onto a known HTTP download mirror.
2206 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2208 raise ExtractorError(u'Cannot transform RTMP url')
2209 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2210 video_url = base + m.group('finalid')
2212 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2217 'upload_date': officialDate,
2222 'description': officialTitle,
2224 results.append(info)
2229 class EscapistIE(InfoExtractor):
# Extractor for escapistmagazine.com. Reads description/thumbnail/player
# from <meta> tags, then fetches the player's config (JS-style JSON) to
# find the playlist entry holding the actual media URL.
2230 """Information extractor for The Escapist """
2232 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2233 IE_NAME = u'escapist'
2235 def _real_extract(self, url):
2236 mobj = re.match(self._VALID_URL, url)
2238 raise ExtractorError(u'Invalid URL: %s' % url)
2239 showName = mobj.group('showname')
2240 videoId = mobj.group('episode')
2242 self.report_extraction(showName)
2243 webPage = self._download_webpage(url, showName)
2245 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2246 description = unescapeHTML(descMatch.group(1))
2247 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2248 imgUrl = unescapeHTML(imgMatch.group(1))
2249 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2250 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config file URL is passed to the player as a 'config=' query parameter.
2251 configUrlMatch = re.search('config=(.*)$', playerUrl)
2252 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2254 configJSON = self._download_webpage(configUrl, showName,
2255 u'Downloading configuration',
2256 u'unable to download configuration')
2258 # Technically, it's JavaScript, not JSON
# NOTE(review): this blanket quote swap breaks if any value contains an
# apostrophe or a double quote — fragile, but matches the site's output.
2259 configJSON = configJSON.replace("'", '"')
2262 config = json.loads(configJSON)
2263 except (ValueError,) as err:
2264 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2266 playlist = config['playlist']
# The media URL lives in the second playlist entry (index 1).
2267 videoUrl = playlist[1]['url']
2272 'uploader': showName,
2273 'upload_date': None,
2276 'thumbnail': imgUrl,
2277 'description': description,
2278 'player_url': playerUrl,
2283 class CollegeHumorIE(InfoExtractor):
# Extractor for collegehumor.com. Two-step: fetch the moogaloop metadata
# XML for titles/thumbnail and a manifest URL, then parse the (Adobe f4m)
# manifest to assemble the final segment URL.
2284 """Information extractor for collegehumor.com"""
2287 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2288 IE_NAME = u'collegehumor'
2290 def report_manifest(self, video_id):
2291 """Report information extraction."""
2292 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2294 def _real_extract(self, url):
2295 mobj = re.match(self._VALID_URL, url)
2297 raise ExtractorError(u'Invalid URL: %s' % url)
2298 video_id = mobj.group('videoid')
2303 'upload_date': None,
2306 self.report_extraction(video_id)
2307 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2309 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2311 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2313 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2315 videoNode = mdoc.findall('./video')[0]
2316 info['description'] = videoNode.findall('./description')[0].text
2317 info['title'] = videoNode.findall('./caption')[0].text
2318 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2319 manifest_url = videoNode.findall('./file')[0].text
2321 raise ExtractorError(u'Invalid metadata XML file')
# 'hdcore' query parameter appended to the manifest request (presumably an
# Akamai HDS requirement — unverified).
2323 manifest_url += '?hdcore=2.10.3'
2324 self.report_manifest(video_id)
2326 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2328 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# The manifest is f4m XML (note the Adobe f4m namespace on the lookups).
2330 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2332 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2333 node_id = media_node.attrib['url']
2334 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2335 except IndexError as err:
2336 raise ExtractorError(u'Invalid manifest file')
# Build the segment URL from the manifest host, trimmed id, and media node.
2338 url_pr = compat_urllib_parse_urlparse(manifest_url)
2339 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2346 class XVideosIE(InfoExtractor):
# Extractor for xvideos.com. Scrapes the watch page: media URL from the
# 'flv_url' player parameter, title from <title>, thumbnail by URL pattern.
2347 """Information extractor for xvideos.com"""
2349 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2350 IE_NAME = u'xvideos'
2352 def _real_extract(self, url):
2353 mobj = re.match(self._VALID_URL, url)
2355 raise ExtractorError(u'Invalid URL: %s' % url)
2356 video_id = mobj.group(1)
2358 webpage = self._download_webpage(url, video_id)
2360 self.report_extraction(video_id)
# The flv URL is percent-encoded inside the player's parameter string.
2364 mobj = re.search(r'flv_url=(.+?)&', webpage)
2366 raise ExtractorError(u'Unable to extract video url')
2367 video_url = compat_urllib_parse.unquote(mobj.group(1))
2371 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2373 raise ExtractorError(u'Unable to extract video title')
2374 video_title = mobj.group(1)
2377 # Extract video thumbnail
2378 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2380 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not just the filename.
2381 video_thumbnail = mobj.group(0)
2387 'upload_date': None,
2388 'title': video_title,
2390 'thumbnail': video_thumbnail,
2391 'description': None,
2397 class SoundcloudIE(InfoExtractor):
# Extractor for individual soundcloud.com tracks. Resolves the page URL to
# a track id via the public resolve API, then queries the streams endpoint
# for the MP3 URL. The client_id below is hard-coded for this API.
2398 """Information extractor for soundcloud.com
2399 To access the media, the uid of the song and a stream token
2400 must be extracted from the page source and the script must make
2401 a request to media.soundcloud.com/crossdomain.xml. Then
2402 the media can be grabbed by requesting from an url composed
2403 of the stream token and uid
2406 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2407 IE_NAME = u'soundcloud'
2409 def report_resolve(self, video_id):
2410 """Report information extraction."""
2411 self.to_screen(u'%s: Resolving id' % video_id)
2413 def _real_extract(self, url):
2414 mobj = re.match(self._VALID_URL, url)
2416 raise ExtractorError(u'Invalid URL: %s' % url)
2418 # extract uploader (which is in the url)
2419 uploader = mobj.group(1)
2420 # extract simple title (uploader + slug of song title)
2421 slug_title = mobj.group(2)
2422 simple_title = uploader + u'-' + slug_title
2423 full_title = '%s/%s' % (uploader, slug_title)
2425 self.report_resolve(full_title)
# resolve.json maps a public page URL to the track's API metadata (incl. id).
2427 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2428 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2429 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2431 info = json.loads(info_json)
2432 video_id = info['id']
2433 self.report_extraction(full_title)
# The streams endpoint returns direct media URLs keyed by format.
2435 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2436 stream_json = self._download_webpage(streams_url, full_title,
2437 u'Downloading stream definitions',
2438 u'unable to download stream definitions')
2440 streams = json.loads(stream_json)
2441 mediaURL = streams['http_mp3_128_url']
2442 upload_date = unified_strdate(info['created_at'])
2447 'uploader': info['user']['username'],
2448 'upload_date': upload_date,
2449 'title': info['title'],
2451 'description': info['description'],
2454 class SoundcloudSetIE(InfoExtractor):
# Extractor for soundcloud.com sets (playlists). Mirrors SoundcloudIE but
# resolves a '/sets/' URL and iterates over every track in the set.
2455 """Information extractor for soundcloud.com sets
2456 To access the media, the uid of the song and a stream token
2457 must be extracted from the page source and the script must make
2458 a request to media.soundcloud.com/crossdomain.xml. Then
2459 the media can be grabbed by requesting from an url composed
2460 of the stream token and uid
2463 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2464 IE_NAME = u'soundcloud:set'
2466 def report_resolve(self, video_id):
2467 """Report information extraction."""
2468 self.to_screen(u'%s: Resolving id' % video_id)
2470 def _real_extract(self, url):
2471 mobj = re.match(self._VALID_URL, url)
2473 raise ExtractorError(u'Invalid URL: %s' % url)
2475 # extract uploader (which is in the url)
2476 uploader = mobj.group(1)
2477 # extract simple title (uploader + slug of song title)
2478 slug_title = mobj.group(2)
2479 simple_title = uploader + u'-' + slug_title
2480 full_title = '%s/sets/%s' % (uploader, slug_title)
2482 self.report_resolve(full_title)
# Same resolve API (and hard-coded client_id) as the single-track extractor.
2484 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2485 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2486 info_json = self._download_webpage(resolv_url, full_title)
2489 info = json.loads(info_json)
# The resolve response may carry an 'errors' list instead of set metadata.
2490 if 'errors' in info:
2491 for err in info['errors']:
2492 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2495 self.report_extraction(full_title)
# One streams lookup per track in the set.
2496 for track in info['tracks']:
2497 video_id = track['id']
2499 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2500 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2502 self.report_extraction(video_id)
2503 streams = json.loads(stream_json)
2504 mediaURL = streams['http_mp3_128_url']
2509 'uploader': track['user']['username'],
2510 'upload_date': unified_strdate(track['created_at']),
2511 'title': track['title'],
2513 'description': track['description'],
2518 class InfoQIE(InfoExtractor):
# Extractor for infoq.com presentations. The real media id is base64-encoded
# in the page's 'jsclassref' variable; the media is served over RTMPE.
2519 """Information extractor for infoq.com"""
2520 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
2525 raise ExtractorError(u'Invalid URL: %s' % url)
2527 webpage = self._download_webpage(url, video_id=url)
2528 self.report_extraction(url)
# 'jsclassref' holds the base64 of a percent-encoded media path.
2531 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2533 raise ExtractorError(u'Unable to extract video url')
2534 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2535 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2538 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2540 raise ExtractorError(u'Unable to extract video title')
2541 video_title = mobj.group(1)
2543 # Extract description
2544 video_description = u'No description available.'
2545 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2546 if mobj is not None:
2547 video_description = mobj.group(1)
# Derive the id and extension from the media path's final component.
2549 video_filename = video_url.split('/')[-1]
2550 video_id, extension = video_filename.split('.')
2556 'upload_date': None,
2557 'title': video_title,
2558 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2560 'description': video_description,
2565 class MixcloudIE(InfoExtractor):
# Extractor for mixcloud.com (disabled via _WORKING = False). Fetches the
# legacy /api/1/cloudcast JSON and probes the listed per-format URLs until
# one responds.
2566 """Information extractor for www.mixcloud.com"""
2568 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2569 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2570 IE_NAME = u'mixcloud'
2572 def report_download_json(self, file_id):
2573 """Report JSON download."""
2574 self.to_screen(u'Downloading json')
2576 def get_urls(self, jsonData, fmt, bitrate='best'):
# Returns the URL list for a format; 'best' (or an unknown bitrate) picks
# the highest available bitrate. Formats without bitrate sub-keys map
# directly to a URL list (handled via the TypeError fallback).
2577 """Get urls from 'audio_formats' section in json"""
2580 bitrate_list = jsonData[fmt]
2581 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2582 bitrate = max(bitrate_list) # select highest
2584 url_list = jsonData[fmt][bitrate]
2585 except TypeError: # we have no bitrate info.
2586 url_list = jsonData[fmt]
2589 def check_urls(self, url_list):
# Probes each candidate URL and returns the first one that opens.
2590 """Returns 1st active url from list"""
2591 for url in url_list:
2593 compat_urllib_request.urlopen(url)
2595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2600 def _print_formats(self, formats):
2601 print('Available formats:')
2602 for fmt in formats.keys():
2603 for b in formats[fmt]:
2605 ext = formats[fmt][b][0]
2606 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2607 except TypeError: # we have no bitrate info
2608 ext = formats[fmt][0]
2609 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2612 def _real_extract(self, url):
2613 mobj = re.match(self._VALID_URL, url)
2615 raise ExtractorError(u'Invalid URL: %s' % url)
2616 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python-2-only; under
# Python 3 these are already str and decode() would fail — confirm compat.
2617 uploader = mobj.group(1).decode('utf-8')
2618 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2620 # construct API request
2621 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2622 # retrieve .json file with links to files
2623 request = compat_urllib_request.Request(file_url)
2625 self.report_download_json(file_url)
2626 jsonData = compat_urllib_request.urlopen(request).read()
2627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2628 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2631 json_data = json.loads(jsonData)
2632 player_url = json_data['player_swf_url']
2633 formats = dict(json_data['audio_formats'])
2635 req_format = self._downloader.params.get('format', None)
2638 if self._downloader.params.get('listformats', None):
2639 self._print_formats(formats)
# 'best' (or no request): take the first format whose URLs actually work.
2642 if req_format is None or req_format == 'best':
2643 for format_param in formats.keys():
2644 url_list = self.get_urls(formats, format_param)
2646 file_url = self.check_urls(url_list)
2647 if file_url is not None:
2650 if req_format not in formats:
2651 raise ExtractorError(u'Format is not available')
2653 url_list = self.get_urls(formats, req_format)
2654 file_url = self.check_urls(url_list)
2655 format_param = req_format
2658 'id': file_id.decode('utf-8'),
2659 'url': file_url.decode('utf-8'),
2660 'uploader': uploader.decode('utf-8'),
2661 'upload_date': None,
2662 'title': json_data['name'],
2663 'ext': file_url.split('.')[-1].decode('utf-8'),
2664 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2665 'thumbnail': json_data['thumbnail_url'],
2666 'description': json_data['description'],
2667 'player_url': player_url.decode('utf-8'),
2670 class StanfordOpenClassroomIE(InfoExtractor):
# Extractor for Stanford Open Classroom. Three URL shapes are handled:
# a specific video (course+video), a course page (list of video references),
# and the root page (list of course references). List results are expanded
# recursively via self.extract on each reference URL.
2671 """Information extractor for Stanford's Open ClassRoom"""
2673 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2674 IE_NAME = u'stanfordoc'
2676 def _real_extract(self, url):
2677 mobj = re.match(self._VALID_URL, url)
2679 raise ExtractorError(u'Invalid URL: %s' % url)
2681 if mobj.group('course') and mobj.group('video'): # A specific video
2682 course = mobj.group('course')
2683 video = mobj.group('video')
2685 'id': course + '_' + video,
2687 'upload_date': None,
2690 self.report_extraction(info['id'])
# Per-video metadata XML provides the title and the media file name.
2691 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2692 xmlUrl = baseUrl + video + '.xml'
2694 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2695 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2696 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2697 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2699 info['title'] = mdoc.findall('./title')[0].text
2700 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2702 raise ExtractorError(u'Invalid metadata XML file')
2703 info['ext'] = info['url'].rpartition('.')[2]
2705 elif mobj.group('course'): # A course page
2706 course = mobj.group('course')
2711 'upload_date': None,
2714 coursepage = self._download_webpage(url, info['id'],
2715 note='Downloading course info page',
2716 errnote='Unable to download course info page')
2718 m = re.search('<h1>([^<]+)</h1>', coursepage)
2720 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
2722 info['title'] = info['id']
2724 m = re.search('<description>([^<]+)</description>', coursepage)
2726 info['description'] = unescapeHTML(m.group(1))
# Collect the course's VideoPage links (de-duplicated, order preserved).
2728 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2731 'type': 'reference',
2732 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2736 for entry in info['list']:
2737 assert entry['type'] == 'reference'
2738 results += self.extract(entry['url'])
2742 'id': 'Stanford OpenClassroom',
2745 'upload_date': None,
2748 self.report_download_webpage(info['id'])
2749 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2751 rootpage = compat_urllib_request.urlopen(rootURL).read()
2752 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2753 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2755 info['title'] = info['id']
# Collect every CoursePage link from the root page and recurse into each.
2757 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2760 'type': 'reference',
2761 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2766 for entry in info['list']:
2767 assert entry['type'] == 'reference'
2768 results += self.extract(entry['url'])
2771 class MTVIE(InfoExtractor):
# Extractor for mtv.com video pages. Reads song/performer/uri/content-id
# from <meta> tags, then fetches the mediaGen XML and picks the last
# (highest-quality) rendition.
2772 """Information extractor for MTV.com"""
2774 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2777 def _real_extract(self, url):
2778 mobj = re.match(self._VALID_URL, url)
2780 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize before downloading.
2781 if not mobj.group('proto'):
2782 url = 'http://' + url
2783 video_id = mobj.group('videoid')
2785 webpage = self._download_webpage(url, video_id)
2787 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2789 raise ExtractorError(u'Unable to extract song name')
# NOTE(review): .decode('iso-8859-1') assumes a bytes group (Python 2 str);
# under Python 3 this would fail on str — confirm compat handling.
2790 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2791 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2793 raise ExtractorError(u'Unable to extract performer')
2794 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2795 video_title = performer + ' - ' + song_name
2797 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message looks truncated — likely meant 'Unable to extract
# mtvn_uri' (runtime string left untouched here).
2799 raise ExtractorError(u'Unable to mtvn_uri')
2800 mtvn_uri = mobj.group(1)
2802 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2804 raise ExtractorError(u'Unable to extract content id')
2805 content_id = mobj.group(1)
2807 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2808 self.report_extraction(video_id)
2809 request = compat_urllib_request.Request(videogen_url)
2811 metadataXml = compat_urllib_request.urlopen(request).read()
2812 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2813 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2815 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2816 renditions = mdoc.findall('.//rendition')
2818 # For now, always pick the highest quality.
2819 rendition = renditions[-1]
# Format string combines container, dimensions, and bitrate attributes.
2822 _,_,ext = rendition.attrib['type'].partition('/')
2823 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2824 video_url = rendition.find('./src').text
2826 raise ExtractorError('Invalid rendition field.')
2831 'uploader': performer,
2832 'upload_date': None,
2833 'title': video_title,
2841 class YoukuIE(InfoExtractor):
# Extractor for v.youku.com. Fetches the getPlayList JSON, descrambles the
# stream file id with a seeded pseudo-random alphabet, then emits one info
# dict per video segment.
2842 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Builds a session id: current time in ms followed by two random numbers.
2845 nowTime = int(time.time() * 1000)
2846 random1 = random.randint(1000,1998)
2847 random2 = random.randint(1000,9999)
2849 return "%d%d%d" %(nowTime,random1,random2)
2851 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffles the source alphabet using a linear
# congruential generator seeded with the server-provided 'seed'.
2853 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2855 for i in range(len(source)):
2856 seed = (seed * 211 + 30031 ) % 65536
2857 index = math.floor(seed / 65536 * len(source) )
2858 mixed.append(source[int(index)])
2859 source.remove(source[int(index)])
2860 #return ''.join(mixed)
2863 def _get_file_id(self, fileId, seed):
# Decodes the '*'-separated index list into characters of the mixed
# alphabet, reconstructing the real file id.
2864 mixed = self._get_file_ID_mix_string(seed)
2865 ids = fileId.split('*')
2869 realId.append(mixed[int(ch)])
2870 return ''.join(realId)
2872 def _real_extract(self, url):
2873 mobj = re.match(self._VALID_URL, url)
2875 raise ExtractorError(u'Invalid URL: %s' % url)
2876 video_id = mobj.group('ID')
2878 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2880 jsondata = self._download_webpage(info_url, video_id)
2882 self.report_extraction(video_id)
2884 config = json.loads(jsondata)
2886 video_title = config['data'][0]['title']
2887 seed = config['data'][0]['seed']
# Pick a stream format: honor --format, else prefer HD when available.
2889 format = self._downloader.params.get('format', None)
2890 supported_format = list(config['data'][0]['streamfileids'].keys())
2892 if format is None or format == 'best':
2893 if 'hd2' in supported_format:
2898 elif format == 'worst':
2906 fileid = config['data'][0]['streamfileids'][format]
2907 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2908 except (UnicodeDecodeError, ValueError, KeyError):
2909 raise ExtractorError(u'Unable to extract info section')
2912 sid = self._gen_sid()
2913 fileid = self._get_file_id(fileid, seed)
2915 #column 8,9 of fileid represent the segment number
2916 #fileid[7:9] should be changed
2917 for index, key in enumerate(keys):
# Splice the zero-padded hex segment index into positions 8-9 of the id.
2919 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2920 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2923 'id': '%s_part%02d' % (video_id, index),
2924 'url': download_url,
2926 'upload_date': None,
2927 'title': video_title,
2930 files_info.append(info)
2935 class XNXXIE(InfoExtractor):
# Extractor for video.xnxx.com. Scrapes media URL, title, and thumbnail
# from the watch page using the class-level regexes below.
2936 """Information extractor for xnxx.com"""
2938 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: player flv URL, <title> prefix, big thumbnail.
2940 VIDEO_URL_RE = r'flv_url=(.*?)&'
2941 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2942 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2944 def _real_extract(self, url):
2945 mobj = re.match(self._VALID_URL, url)
2947 raise ExtractorError(u'Invalid URL: %s' % url)
2948 video_id = mobj.group(1)
2950 # Get webpage content
2951 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the player parameter string.
2953 result = re.search(self.VIDEO_URL_RE, webpage)
2955 raise ExtractorError(u'Unable to extract video url')
2956 video_url = compat_urllib_parse.unquote(result.group(1))
2958 result = re.search(self.VIDEO_TITLE_RE, webpage)
2960 raise ExtractorError(u'Unable to extract video title')
2961 video_title = result.group(1)
2963 result = re.search(self.VIDEO_THUMB_RE, webpage)
2965 raise ExtractorError(u'Unable to extract video thumbnail')
2966 video_thumbnail = result.group(1)
2972 'upload_date': None,
2973 'title': video_title,
2975 'thumbnail': video_thumbnail,
2976 'description': None,
2980 class GooglePlusIE(InfoExtractor):
# Extractor for Google+ post videos. Two steps: scrape the post page for
# date/uploader/title and the photo-viewer URL, then scrape that video
# page for all resolution variants and pick the highest.
2981 """Information extractor for plus.google.com."""
2983 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2984 IE_NAME = u'plus.google'
2986 def report_extract_entry(self, url):
2987 """Report the entry being downloaded."""
2988 self.to_screen(u'Downloading entry: %s' % url)
2990 def report_date(self, upload_date):
2991 """Report the extracted upload date."""
2992 self.to_screen(u'Entry date: %s' % upload_date)
2994 def report_uploader(self, uploader):
2995 """Report the extracted uploader."""
2996 self.to_screen(u'Uploader: %s' % uploader)
2998 def report_title(self, video_title):
2999 """Report the extracted title."""
3000 self.to_screen(u'Title: %s' % video_title)
3002 def report_extract_vid_page(self, video_page):
3003 """Report information extraction."""
3004 self.to_screen(u'Extracting video page: %s' % video_page)
3006 def _real_extract(self, url):
3007 # Extract id from URL
3008 mobj = re.match(self._VALID_URL, url)
3010 raise ExtractorError(u'Invalid URL: %s' % url)
3012 post_url = mobj.group(0)
3013 video_id = mobj.group(1)
3015 video_extension = 'flv'
3017 # Step 1, Retrieve post webpage to extract further information
3018 self.report_extract_entry(post_url)
3019 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3021 # Extract update date
3023 pattern = 'title="Timestamp">(.*?)</a>'
3024 mobj = re.search(pattern, webpage)
3026 upload_date = mobj.group(1)
3027 # Convert timestring to a format suitable for filename
3028 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3029 upload_date = upload_date.strftime('%Y%m%d')
3030 self.report_date(upload_date)
3034 pattern = r'rel\="author".*?>(.*?)</a>'
3035 mobj = re.search(pattern, webpage)
3037 uploader = mobj.group(1)
3038 self.report_uploader(uploader)
3041 # Get the first line for title
3043 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3044 mobj = re.search(pattern, webpage)
3046 video_title = mobj.group(1)
3047 self.report_title(video_title)
3049 # Step 2, Stimulate clicking the image box to launch video
3050 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3051 mobj = re.search(pattern, webpage)
3053 raise ExtractorError(u'Unable to extract video page URL')
3055 video_page = mobj.group(1)
3056 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3057 self.report_extract_vid_page(video_page)
3060 # Extract video links on video page
3061 """Extract video links of all sizes"""
3062 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3063 mobj = re.findall(pattern, webpage)
3065 raise ExtractorError(u'Unable to extract video links')
3067 # Sort in resolution
3068 links = sorted(mobj)
3070 # Choose the lowest of the sort, i.e. highest resolution
3071 video_url = links[-1]
3072 # Only get the url. The resolution part in the tuple has no use anymore
3073 video_url = video_url[-1]
3074 # Treat escaped \u0026 style hex
# Python 2: bytes-like str decodes directly; Python 3 path is the except.
3076 video_url = video_url.decode("unicode_escape")
3077 except AttributeError: # Python 3
3078 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3084 'uploader': uploader,
3085 'upload_date': upload_date,
3086 'title': video_title,
3087 'ext': video_extension,
3090 class NBAIE(InfoExtractor):
3091 """Information extractor for nba.com video pages."""
3091 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3094 def _real_extract(self, url):
3095 mobj = re.match(self._VALID_URL, url)
3097 raise ExtractorError(u'Invalid URL: %s' % url)
3099 video_id = mobj.group(1)
3100 if video_id.endswith('/index.html'):
3101 video_id = video_id[:-len('/index.html')]
3103 webpage = self._download_webpage(url, video_id)
# Direct CDN URL is built from the path; the page itself is only used for metadata.
3105 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3106 def _findProp(rexp, default=None):
3107 m = re.search(rexp, webpage)
3109 return unescapeHTML(m.group(1))
3113 shortened_video_id = video_id.rpartition('/')[2]
3114 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3116 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (the field documented on InfoExtractor) — confirm downstream use.
3120 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3121 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3125 class JustinTVIE(InfoExtractor):
3126 """Information extractor for justin.tv and twitch.tv"""
3127 # TODO: One broadcast may be split into multiple videos. The key
3128 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3129 # starts at 1 and increases. Can we treat all parts as one video?
3131 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3133 (?P<channelid>[^/]+)|
3134 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3135 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3139 _JUSTIN_PAGE_LIMIT = 100
3140 IE_NAME = u'justin.tv'
3142 def report_download_page(self, channel, offset):
3143 """Report attempt to download a single page of videos."""
3144 self.to_screen(u'%s: Downloading video information from %d to %d' %
3145 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3147 # Return count of items, list of *valid* items
3148 def _parse_page(self, url, video_id):
3149 webpage = self._download_webpage(url, video_id,
3150 u'Downloading video info JSON',
3151 u'unable to download video info JSON')
3153 response = json.loads(webpage)
# The API returns a list on success and a dict carrying 'error' on failure.
3154 if type(response) != list:
3155 error_text = response.get('error', 'unknown error')
3156 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3158 for clip in response:
3159 video_url = clip['video_file_url']
3161 video_extension = os.path.splitext(video_url)[1][1:]
3162 video_date = re.sub('-', '', clip['start_time'][:10])
3163 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3164 video_id = clip['id']
3165 video_title = clip.get('title', video_id)
3169 'title': video_title,
3170 'uploader': clip.get('channel_name', video_uploader_id),
3171 'uploader_id': video_uploader_id,
3172 'upload_date': video_date,
3173 'ext': video_extension,
3175 return (len(response), info)
3177 def _real_extract(self, url):
3178 mobj = re.match(self._VALID_URL, url)
3180 raise ExtractorError(u'invalid URL: %s' % url)
3182 api_base = 'http://api.justin.tv'
3184 if mobj.group('channelid'):
3186 video_id = mobj.group('channelid')
3187 api = api_base + '/channel/archives/%s.json' % video_id
3188 elif mobj.group('chapterid'):
# Chapters require a second lookup: the page only carries the archive id.
3189 chapter_id = mobj.group('chapterid')
3191 webpage = self._download_webpage(url, chapter_id)
3192 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3194 raise ExtractorError(u'Cannot find archive of a chapter')
3195 archive_id = m.group(1)
3197 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3198 chapter_info_xml = self._download_webpage(api, chapter_id,
3199 note=u'Downloading chapter information',
3200 errnote=u'Chapter information download failed')
3201 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3202 for a in doc.findall('.//archive'):
3203 if archive_id == a.find('./id').text:
3206 raise ExtractorError(u'Could not find chapter in chapter information')
3208 video_url = a.find('./video_file_url').text
3209 video_ext = video_url.rpartition('.')[2] or u'flv'
3211 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3212 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3213 note='Downloading chapter metadata',
3214 errnote='Download of chapter metadata failed')
3215 chapter_info = json.loads(chapter_info_json)
3217 bracket_start = int(doc.find('.//bracket_start').text)
3218 bracket_end = int(doc.find('.//bracket_end').text)
3220 # TODO determine start (and probably fix up file)
3221 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3222 #video_url += u'?start=' + TODO:start_timestamp
3223 # bracket_start is 13290, but we want 51670615
3224 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3225 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3228 'id': u'c' + chapter_id,
3231 'title': chapter_info['title'],
3232 'thumbnail': chapter_info['preview'],
3233 'description': chapter_info['description'],
3234 'uploader': chapter_info['channel']['display_name'],
3235 'uploader_id': chapter_info['channel']['name'],
3239 video_id = mobj.group('videoid')
3240 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3242 self.report_extraction(video_id)
# Page through the archive API; a short page (count != limit) signals the end.
3246 limit = self._JUSTIN_PAGE_LIMIT
3249 self.report_download_page(video_id, offset)
3250 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3251 page_count, page_info = self._parse_page(page_url, video_id)
3252 info.extend(page_info)
3253 if not paged or page_count != limit:
3258 class FunnyOrDieIE(InfoExtractor):
3258 """Information extractor for funnyordie.com."""
3259 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3261 def _real_extract(self, url):
3262 mobj = re.match(self._VALID_URL, url)
3264 raise ExtractorError(u'invalid URL: %s' % url)
3266 video_id = mobj.group('id')
3267 webpage = self._download_webpage(url, video_id)
3269 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3271 raise ExtractorError(u'Unable to find video information')
3272 video_url = unescapeHTML(m.group('url'))
# Prefer the player headline; fall back to the page <title>.
3274 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3276 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3278 raise ExtractorError(u'Cannot find video title')
3279 title = clean_html(m.group('title'))
3281 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3283 desc = unescapeHTML(m.group('desc'))
3292 'description': desc,
3296 class SteamIE(InfoExtractor):
3297 """Information extractor for store.steampowered.com game trailers."""
3297 _VALID_URL = r"""http://store\.steampowered\.com/
3299 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3301 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3305 def suitable(cls, url):
3306 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
3307 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3309 def _real_extract(self, url):
3310 m = re.match(self._VALID_URL, url, re.VERBOSE)
3311 gameID = m.group('gameID')
# The agecheck URL with a fixed (1970) birth date bypasses the age gate.
3312 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3313 self.report_age_confirmation()
3314 webpage = self._download_webpage(videourl, gameID)
3315 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3317 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3318 mweb = re.finditer(urlRE, webpage)
3319 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3320 titles = re.finditer(namesRE, webpage)
3321 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3322 thumbs = re.finditer(thumbsRE, webpage)
# Movie urls, titles and thumbnails are matched positionally, in page order.
3324 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3325 video_id = vid.group('videoID')
3326 title = vtitle.group('videoName')
3327 video_url = vid.group('videoURL')
3328 video_thumb = thumb.group('thumbnail')
3330 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3335 'title': unescapeHTML(title),
3336 'thumbnail': video_thumb
3339 return [self.playlist_result(videos, gameID, game_title)]
3341 class UstreamIE(InfoExtractor):
3341 """Information extractor for ustream.tv recorded videos."""
3342 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3343 IE_NAME = u'ustream'
3345 def _real_extract(self, url):
3346 m = re.match(self._VALID_URL, url)
3347 video_id = m.group('videoID')
# Media URL is derived directly from the id; the page only supplies metadata.
3348 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3349 webpage = self._download_webpage(url, video_id)
3350 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3351 title = m.group('title')
3352 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3353 uploader = m.group('uploader')
3359 'uploader': uploader
3363 class WorldStarHipHopIE(InfoExtractor):
3363 """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
3364 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3365 IE_NAME = u'WorldStarHipHop'
3367 def _real_extract(self, url):
# Flash player's addVariable call carries the direct media URL.
3368 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3370 m = re.match(self._VALID_URL, url)
3371 video_id = m.group('id')
3373 webpage_src = self._download_webpage(url, video_id)
3375 mobj = re.search(_src_url, webpage_src)
3377 if mobj is not None:
3378 video_url = mobj.group(1)
3379 if 'mp4' in video_url:
3384 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3386 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3389 raise ExtractorError(u'Cannot determine title')
3390 title = mobj.group(1)
3392 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3393 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3394 if mobj is not None:
3395 thumbnail = mobj.group(1)
3397 _title = r"""candytitles.*>(.*)</span>"""
3398 mobj = re.search(_title, webpage_src)
3399 if mobj is not None:
3400 title = mobj.group(1)
3407 'thumbnail' : thumbnail,
3412 class RBMARadioIE(InfoExtractor):
3412 """Information extractor for rbmaradio.com shows."""
3413 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3415 def _real_extract(self, url):
3416 m = re.match(self._VALID_URL, url)
3417 video_id = m.group('videoID')
3419 webpage = self._download_webpage(url, video_id)
# Show metadata lives in an inline JSON assignment (gon.show=...).
3420 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3422 raise ExtractorError(u'Cannot find metadata')
3423 json_data = m.group(1)
3426 data = json.loads(json_data)
3427 except ValueError as e:
3428 raise ExtractorError(u'Invalid JSON: ' + str(e))
# cbr=256 requests the 256 kbps stream from the CDN.
3430 video_url = data['akamai_url'] + '&cbr=256'
3431 url_parts = compat_urllib_parse_urlparse(video_url)
3432 video_ext = url_parts.path.rpartition('.')[2]
3437 'title': data['title'],
3438 'description': data.get('teaser_text'),
3439 'location': data.get('country_of_origin'),
3440 'uploader': data.get('host', {}).get('name'),
3441 'uploader_id': data.get('host', {}).get('slug'),
3442 'thumbnail': data.get('image', {}).get('large_url_2x'),
3443 'duration': data.get('duration'),
3448 class YouPornIE(InfoExtractor):
3449 """Information extractor for youporn.com."""
3450 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3452 def _print_formats(self, formats):
3453 """Print all available formats"""
3454 print(u'Available formats:')
3455 print(u'ext\t\tformat')
3456 print(u'---------------------------------')
3457 for format in formats:
3458 print(u'%s\t\t%s' % (format['ext'], format['format']))
3460 def _specific(self, req_format, formats):
3460 """Return the format entry matching req_format, if any."""
3462 if(x["format"]==req_format):
3466 def _real_extract(self, url):
3467 mobj = re.match(self._VALID_URL, url)
3469 raise ExtractorError(u'Invalid URL: %s' % url)
3471 video_id = mobj.group('videoid')
# Pretend the age gate was passed via cookie.
3473 req = compat_urllib_request.Request(url)
3474 req.add_header('Cookie', 'age_verified=1')
3475 webpage = self._download_webpage(req, video_id)
3477 # Get the video title
3478 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3480 raise ExtractorError(u'Unable to extract video title')
3481 video_title = result.group('title').strip()
3483 # Get the video date
3484 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3486 self._downloader.report_warning(u'unable to extract video date')
3489 upload_date = unified_strdate(result.group('date').strip())
3491 # Get the video uploader
3492 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3494 self._downloader.report_warning(u'unable to extract uploader')
3495 video_uploader = None
3497 video_uploader = result.group('uploader').strip()
3498 video_uploader = clean_html( video_uploader )
3500 # Get all of the formats available
3501 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3502 result = re.search(DOWNLOAD_LIST_RE, webpage)
3504 raise ExtractorError(u'Unable to extract download list')
3505 download_list_html = result.group('download_list').strip()
3507 # Get all of the links from the page
3508 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3509 links = re.findall(LINK_RE, download_list_html)
3510 if(len(links) == 0):
3511 raise ExtractorError(u'ERROR: no known formats available for video')
3513 self.to_screen(u'Links found: %d' % len(links))
3518 # A link looks like this:
3519 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3520 # A path looks like this:
3521 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3522 video_url = unescapeHTML( link )
3523 path = compat_urllib_parse_urlparse( video_url ).path
3524 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size and bitrate.
3525 format = path.split('/')[4].split('_')[:2]
3528 format = "-".join( format )
3529 title = u'%s-%s-%s' % (video_title, size, bitrate)
3534 'uploader': video_uploader,
3535 'upload_date': upload_date,
3540 'description': None,
3544 if self._downloader.params.get('listformats', None):
3545 self._print_formats(formats)
# Format selection: first entry is best, last is worst (page order).
3548 req_format = self._downloader.params.get('format', None)
3549 self.to_screen(u'Format: %s' % req_format)
3551 if req_format is None or req_format == 'best':
3553 elif req_format == 'worst':
3554 return [formats[-1]]
3555 elif req_format in ('-1', 'all'):
3558 format = self._specific( req_format, formats )
3560 raise ExtractorError(u'Requested format not available')
3565 class PornotubeIE(InfoExtractor):
3566 """Information extractor for pornotube.com."""
3567 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3569 def _real_extract(self, url):
3570 mobj = re.match(self._VALID_URL, url)
3572 raise ExtractorError(u'Invalid URL: %s' % url)
3574 video_id = mobj.group('videoid')
3575 video_title = mobj.group('title')
3577 # Get webpage content
3578 webpage = self._download_webpage(url, video_id)
3581 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3582 result = re.search(VIDEO_URL_RE, webpage)
3584 raise ExtractorError(u'Unable to extract video url')
3585 video_url = compat_urllib_parse.unquote(result.group('url'))
3587 #Get the uploaded date
3588 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3589 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure path reports 'Unable to extract video title'
# although it is the upload date that failed to match — message looks wrong.
3591 raise ExtractorError(u'Unable to extract video title')
3592 upload_date = unified_strdate(result.group('date'))
3594 info = {'id': video_id,
3597 'upload_date': upload_date,
3598 'title': video_title,
3604 class YouJizzIE(InfoExtractor):
3605 """Information extractor for youjizz.com."""
3606 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3608 def _real_extract(self, url):
3609 mobj = re.match(self._VALID_URL, url)
3611 raise ExtractorError(u'Invalid URL: %s' % url)
3613 video_id = mobj.group('videoid')
3615 # Get webpage content
3616 webpage = self._download_webpage(url, video_id)
3618 # Get the video title
3619 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3621 raise ExtractorError(u'ERROR: unable to extract video title')
3622 video_title = result.group('title').strip()
3624 # Get the embed page
3625 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3627 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed-page id replaces the slug id from the watch URL.
3629 embed_page_url = result.group(0).strip()
3630 video_id = result.group('videoid')
3632 webpage = self._download_webpage(embed_page_url, video_id)
3635 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3637 raise ExtractorError(u'ERROR: unable to extract video url')
3638 video_url = result.group('source')
3640 info = {'id': video_id,
3642 'title': video_title,
3645 'player_url': embed_page_url}
3649 class EightTracksIE(InfoExtractor):
3649 """Information extractor for 8tracks.com mixes (playlists of songs)."""
3651 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3653 def _real_extract(self, url):
3654 mobj = re.match(self._VALID_URL, url)
3656 raise ExtractorError(u'Invalid URL: %s' % url)
3657 playlist_id = mobj.group('id')
3659 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JSON assignment (PAGE.mix = {...};).
3661 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3663 raise ExtractorError(u'Cannot find trax information')
3664 json_like = m.group(1)
3665 data = json.loads(json_like)
# The play API requires a random session token per client.
3667 session = str(random.randint(0, 1000000000))
3669 track_count = data['tracks_count']
3670 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3671 next_url = first_url
3673 for i in itertools.count():
3674 api_json = self._download_webpage(next_url, playlist_id,
3675 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3676 errnote=u'Failed to download song information')
3677 api_data = json.loads(api_json)
3678 track_data = api_data[u'set']['track']
3680 'id': track_data['id'],
3681 'url': track_data['track_file_stream_url'],
3682 'title': track_data['performer'] + u' - ' + track_data['name'],
3683 'raw_title': track_data['name'],
3684 'uploader_id': data['user']['login'],
# Walk the mix one track at a time until the API flags the last track.
3688 if api_data['set']['at_last_track']:
3690 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3693 class KeekIE(InfoExtractor):
3693 """Information extractor for keek.com."""
3694 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3697 def _real_extract(self, url):
3698 m = re.match(self._VALID_URL, url)
3699 video_id = m.group('videoID')
# Media and thumbnail URLs are derived from the id on the CDN.
3700 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3701 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3702 webpage = self._download_webpage(url, video_id)
3703 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3704 title = unescapeHTML(m.group('title'))
3705 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3706 uploader = clean_html(m.group('uploader'))
3712 'thumbnail': thumbnail,
3713 'uploader': uploader
3717 class TEDIE(InfoExtractor):
3717 """Information extractor for ted.com talks and playlists."""
3718 _VALID_URL=r'''http://www\.ted\.com/
3720 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3722 ((?P<type_talk>talks)) # We have a simple talk
3724 (/lang/(.*?))? # The url may contain the language
3725 /(?P<name>\w+) # Here goes the name and then ".html"
3729 def suitable(cls, url):
3730 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern.
3731 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3733 def _real_extract(self, url):
3734 m=re.match(self._VALID_URL, url, re.VERBOSE)
3735 if m.group('type_talk'):
3736 return [self._talk_info(url)]
3738 playlist_id=m.group('playlist_id')
3739 name=m.group('name')
3740 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3741 return [self._playlist_videos_info(url,name,playlist_id)]
3743 def _talk_video_link(self,mediaSlug):
3744 '''Returns the video link for that mediaSlug'''
3745 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3747 def _playlist_videos_info(self,url,name,playlist_id=0):
3748 '''Returns the videos of the playlist'''
3750 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3751 ([.\s]*?)data-playlist_item_id="(\d+)"
3752 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3754 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3755 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3756 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3757 m_names=re.finditer(video_name_RE,webpage)
3759 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3760 m_playlist = re.search(playlist_RE, webpage)
3761 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this IE via a talk URL.
3763 playlist_entries = []
3764 for m_video, m_name in zip(m_videos,m_names):
3765 video_id=m_video.group('video_id')
3766 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3767 playlist_entries.append(self.url_result(talk_url, 'TED'))
3768 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3770 def _talk_info(self, url, video_id=0):
3771 """Return the video for the talk in the url"""
3772 m=re.match(self._VALID_URL, url,re.VERBOSE)
3773 videoName=m.group('name')
3774 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3775 # If the url includes the language we get the title translated
3776 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3777 title=re.search(title_RE, webpage).group('title')
3778 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3779 "id":(?P<videoID>[\d]+).*?
3780 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3781 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3782 thumb_match=re.search(thumb_RE,webpage)
3783 info_match=re.search(info_RE,webpage,re.VERBOSE)
3784 video_id=info_match.group('videoID')
3785 mediaSlug=info_match.group('mediaSlug')
3786 video_url=self._talk_video_link(mediaSlug)
3792 'thumbnail': thumb_match.group('thumbnail')
3796 class MySpassIE(InfoExtractor):
3796 """Information extractor for myspass.de (metadata via XML API)."""
3797 _VALID_URL = r'http://www.myspass.de/.*'
3799 def _real_extract(self, url):
3800 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3802 # video id is the last path element of the URL
3803 # usually there is a trailing slash, so also try the second but last
3804 url_path = compat_urllib_parse_urlparse(url).path
3805 url_parent_path, video_id = os.path.split(url_path)
3807 _, video_id = os.path.split(url_parent_path)
3810 metadata_url = META_DATA_URL_TEMPLATE % video_id
3811 metadata_text = self._download_webpage(metadata_url, video_id)
3812 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3814 # extract values from metadata
3815 url_flv_el = metadata.find('url_flv')
3816 if url_flv_el is None:
3817 raise ExtractorError(u'Unable to extract download url')
3818 video_url = url_flv_el.text
3819 extension = os.path.splitext(video_url)[1][1:]
3820 title_el = metadata.find('title')
3821 if title_el is None:
3822 raise ExtractorError(u'Unable to extract title')
3823 title = title_el.text
3824 format_id_el = metadata.find('format_id')
3825 if format_id_el is None:
3828 format = format_id_el.text
# Description and thumbnail are optional in the metadata XML.
3829 description_el = metadata.find('description')
3830 if description_el is not None:
3831 description = description_el.text
3834 imagePreview_el = metadata.find('imagePreview')
3835 if imagePreview_el is not None:
3836 thumbnail = imagePreview_el.text
3845 'thumbnail': thumbnail,
3846 'description': description
3850 class SpiegelIE(InfoExtractor):
3850 """Information extractor for spiegel.de videos."""
3851 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3853 def _real_extract(self, url):
3854 m = re.match(self._VALID_URL, url)
3855 video_id = m.group('videoID')
3857 webpage = self._download_webpage(url, video_id)
3858 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3860 raise ExtractorError(u'Cannot find title')
3861 video_title = unescapeHTML(m.group(1))
# Per-video XML document lists the available encodings; use the last one.
3863 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3864 xml_code = self._download_webpage(xml_url, video_id,
3865 note=u'Downloading XML', errnote=u'Failed to download XML')
3867 idoc = xml.etree.ElementTree.fromstring(xml_code)
3868 last_type = idoc[-1]
3869 filename = last_type.findall('./filename')[0].text
3870 duration = float(last_type.findall('./duration')[0].text)
3872 video_url = 'http://video2.spiegel.de/flash/' + filename
3873 video_ext = filename.rpartition('.')[2]
3878 'title': video_title,
3879 'duration': duration,
3883 class LiveLeakIE(InfoExtractor):
3883 """Information extractor for liveleak.com."""
3885 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3886 IE_NAME = u'liveleak'
3888 def _real_extract(self, url):
3889 mobj = re.match(self._VALID_URL, url)
3891 raise ExtractorError(u'Invalid URL: %s' % url)
3893 video_id = mobj.group('video_id')
3895 webpage = self._download_webpage(url, video_id)
3897 m = re.search(r'file: "(.*?)",', webpage)
3899 raise ExtractorError(u'Unable to find video url')
3900 video_url = m.group(1)
3902 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3904 raise ExtractorError(u'Cannot find video title')
# Strip the site prefix that LiveLeak puts in og:title.
3905 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3907 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3909 desc = unescapeHTML(m.group('desc'))
3913 m = re.search(r'By:.*?(\w+)</a>', webpage)
3915 uploader = clean_html(m.group(1))
3924 'description': desc,
3925 'uploader': uploader
3930 class ARDIE(InfoExtractor):
3930 """Information extractor for the ARD Mediathek."""
3931 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3932 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3933 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3935 def _real_extract(self, url):
3936 # determine video id from url
3937 m = re.match(self._VALID_URL, url)
3939 numid = re.search(r'documentId=([0-9]+)', url)
3941 video_id = numid.group(1)
3943 video_id = m.group('video_id')
3945 # determine title and media streams from webpage
3946 html = self._download_webpage(url, video_id)
3947 title = re.search(self._TITLE, html).group('title')
3948 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# FSK-restricted videos expose no streams before 8 pm German time.
3950 assert '"fsk"' in html
3951 raise ExtractorError(u'This video is only available after 8:00 pm')
3953 # choose default media type and highest quality for now
3954 stream = max([s for s in streams if int(s["media_type"]) == 0],
3955 key=lambda s: int(s["quality"]))
3957 # there's two possibilities: RTMP stream or HTTP download
3958 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3959 if stream['rtmp_url']:
3960 self.to_screen(u'RTMP download detected')
3961 assert stream['video_url'].startswith('mp4:')
3962 info["url"] = stream["rtmp_url"]
3963 info["play_path"] = stream['video_url']
3965 assert stream["video_url"].endswith('.mp4')
3966 info["url"] = stream["video_url"]
3969 class TumblrIE(InfoExtractor):
3969 """Information extractor for tumblr.com video posts."""
3970 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3972 def _real_extract(self, url):
3973 m_url = re.match(self._VALID_URL, url)
3974 video_id = m_url.group('id')
3975 blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading.
3977 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3978 webpage = self._download_webpage(url, video_id)
# Video tag is embedded inside escaped JS (\x22 quoting), hence the \\x22.
3980 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3981 video = re.search(re_video, webpage)
3983 self.to_screen("No video founded")
3985 video_url = video.group('video_url')
3986 ext = video.group('ext')
3988 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
3989 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
3991 # The only place where you can get a title, it's not complete,
3992 # but searching in other places doesn't work for all videos
3993 re_title = r'<title>(?P<title>.*?)</title>'
3994 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
3996 return [{'id': video_id,
4003 class BandcampIE(InfoExtractor):
4003 """Information extractor for bandcamp.com tracks (free downloads only)."""
4004 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4006 def _real_extract(self, url):
4007 mobj = re.match(self._VALID_URL, url)
4008 title = mobj.group('title')
4009 webpage = self._download_webpage(url, title)
4010 # We get the link to the free download page
4011 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4012 if m_download is None:
4013 raise ExtractorError(u'No free songs founded')
4015 download_link = m_download.group(1)
# NOTE(review): local name 'id' shadows the builtin.
4016 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4017 webpage, re.MULTILINE|re.DOTALL).group('id')
4019 download_webpage = self._download_webpage(download_link, id,
4020 'Downloading free downloads page')
4021 # We get the dictionary of the track from some javascrip code
4022 info = re.search(r'items: (.*?),$',
4023 download_webpage, re.MULTILINE).group(1)
4024 info = json.loads(info)[0]
4025 # We pick mp3-320 for now, until format selection can be easily implemented.
4026 mp3_info = info[u'downloads'][u'mp3-320']
4027 # If we try to use this url it says the link has expired
4028 initial_url = mp3_info[u'url']
4029 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4030 m_url = re.match(re_url, initial_url)
4031 #We build the url we will use to get the final track url
4032 # This url is build in Bandcamp in the script download_bunde_*.js
4033 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4034 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4035 # If we could correctly generate the .rand field the url would be
4036 #in the "download_url" key
4037 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4039 track_info = {'id':id,
4040 'title' : info[u'title'],
4043 'thumbnail' : info[u'thumb_url'],
4044 'uploader' : info[u'artist']
4049 class RedTubeIE(InfoExtractor):
4050 """Information Extractor for redtube"""
4051 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4053 def _real_extract(self,url):
4054 mobj = re.match(self._VALID_URL, url)
4056 raise ExtractorError(u'Invalid URL: %s' % url)
4058 video_id = mobj.group('id')
4059 video_extension = 'mp4'
4060 webpage = self._download_webpage(url, video_id)
4061 self.report_extraction(video_id)
# Direct mp4 URL is exposed in a <source> tag.
4062 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4065 raise ExtractorError(u'Unable to extract media URL')
4067 video_url = mobj.group(1)
4068 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4070 raise ExtractorError(u'Unable to extract title')
4071 video_title = mobj.group(1)
4076 'ext': video_extension,
4077 'title': video_title,
4080 class InaIE(InfoExtractor):
4081 """Information Extractor for Ina.fr"""
4082 _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4084 def _real_extract(self,url):
4085 mobj = re.match(self._VALID_URL, url)
4087 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page.
4088 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4089 video_extension = 'mp4'
4090 webpage = self._download_webpage(mrss_url, video_id)
4092 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4094 raise ExtractorError(u'Unable to extract media URL')
4095 video_url = mobj.group(1)
4097 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4099 raise ExtractorError(u'Unable to extract title')
4100 video_title = mobj.group(1)
4105 'ext': video_extension,
4106 'title': video_title,
4109 def gen_extractors():
4110 """ Return a list of an instance of every supported extractor.
4111 The order does matter; the first extractor matched is the one handling the URL.
# Keep the most specific extractors before the generic ones.
4114 YoutubePlaylistIE(),
4139 StanfordOpenClassroomIE(),
4149 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class for *ie_name*.

    The class is looked up among this module's globals under the
    conventional name ``<ie_name>IE`` (e.g. ``'Youtube'`` -> ``YoutubeIE``).
    Raises KeyError if no such extractor class exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]