More callbacks changed to raise ExtractorError
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready: True once _real_initialize() has run (lazy initialization flag).
    _ready = False
    # _downloader: the FileDownloader instance used for output and options.
    _downloader = None
    # _WORKING: set to False in subclasses that are known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the name from the class name by stripping the 'IE' suffix
        # (e.g. 'YoutubeIE' -> 'Youtube'). Subclasses may override this with
        # a plain class attribute instead.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note=None prints the default progress line, note=False suppresses
        output entirely, and any other value is printed verbatim.
        Raises ExtractorError on any network failure.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second positional argument is the original traceback
            # (NOTE(review): assumes ExtractorError accepts a tb argument —
            # confirm against its definition in utils). `sys` must be
            # imported at module level for this to work.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Return a tuple (page content as string, URL handle)."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Pick the declared charset from the Content-Type header, falling
        # back to UTF-8 when none is advertised.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request was a plain string, not a Request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # Dump the raw bytes base64-encoded so binary pages survive the
            # terminal; printed without the IE-name prefix on purpose.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared charsets.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a string."""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key so the FileDownloader
    # knows how to process the result.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex: group 1 matches everything preceding the video ID,
    # group 2 is the 11-character video ID itself (used by _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Format itags, listed in order of quality (best first).
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, but free (WebM) formats are preferred over non-free ones.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything not listed defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'heightxwidth' display string (only used for --list-formats
    # and the 'format' info field).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to
        # YoutubePlaylistIE explicitly.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of subtitles for one language and format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict {lang_code: track_name} of available subtitles.

        On failure, returns a 2-tuple (error_message, None) instead of a
        dict; callers distinguish the cases with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Each <track> element carries name="..." and lang_code="...".
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages to the screen."""
        sub_lang_list = self._get_available_subtitles(video_id)
        # NOTE(review): if _get_available_subtitles returned an error tuple,
        # this prints the error message as if it were a language list.
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)

        error_message is None on success.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        Language preference: --sub-lang option, then 'en', then the first
        available language.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of the
        (error_message, sub_lang, sub) tuples from _request_subtitle."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print 'itag : extension [dimensions]' for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the interface language and, if credentials are configured,
        log in and confirm age. All failures before age confirmation are
        reported as warnings and abort initialization silently."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh form tokens that Google's login
        # form requires to be echoed back.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL) or raise
        ExtractorError if the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-style backslash escaping in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one response
        # contains a 'token' field.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            # note=False suppresses the per-request progress line.
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before parsing the date string.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # The stream map is a comma-separated list of urlencoded dicts,
            # one per available format.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is accessed unconditionally, but the filter
            # above only guarantees 'itag' and 'url' — an entry without a
            # 'sig' field raises KeyError here; confirm upstream.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Only consider formats at or below the requested quality cap.
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
674
675
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and post the form that
        disables the filter for the rest of the session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video info dict for a metacafe.com watch URL.

        YouTube-hosted videos ('yt-' prefixed ids) are delegated to the
        YouTube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages embed the media data as JSON inside the
            # 'flashvars' parameter instead of plain query arguments
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage is already a decoded text string (_download_webpage decodes
        # it), so the former .decode('utf-8') calls were redundant and crash
        # on Python 3, where str has no decode().
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
771
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the video info dict for a Dailymotion URL."""
        # Validate the URL and pull out the bare video id
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Locate the flashvars blob that carries the media URLs
        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Probe the known quality keys from best to worst and keep the
        # first one present in the flashvars
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(match.group('title'))

        # Uploader: try the owner-span markup first, then the
        # official-user markup as a fallback
        video_uploader = None
        match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if match is not None:
            video_uploader = match.group(1)
        else:
            match_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if match_official is not None:
                video_uploader = match_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date, when present, is rendered as DD-MM-YYYY
        video_upload_date = None
        match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if match is not None:
            video_upload_date = match.group(3) + match.group(2) + match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
846
847
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the video info dict for a photobucket.com media URL.

        Tries the JSON blob embedded in the page javascript first, and
        falls back to scraping the old-style HTML markup.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # webpage is already a decoded text string (_download_webpage decodes
        # it), so the former .decode('utf-8') calls were redundant and crash
        # on Python 3, where str has no decode().
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
909
910
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a Yahoo! Video URL.

        Non-'/watch/' URLs are first rewritten to their canonical English
        language '/watch/' form and re-extracted once (new_video=False on
        the recursive call).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video uploader')
        # Group 1 only captures the 'people'/'profile' path segment; the
        # uploader name itself is in group 2 (using group 1 here was a bug).
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width; the playlist request below
        # requires both to succeed
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1028
1029
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Note: a bare 'except:' here would also swallow
            # KeyboardInterrupt/SystemExit; Exception lets those propagate.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; the for/else raises when no
        # known codec was found at any quality
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1131
1132
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return its contents, raising ExtractorError on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, apply regex, and return a dict built from matchTuples.

        matchTuples is a list of (group_index, key, error_message) triples;
        every listed group must have matched, otherwise
        ExtractorError(error_message) is raised.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp live stream referenced by url.

        Live (rtmp) streams are not supported yet. This used to compute the
        stream URL and then fall off the end, implicitly returning None,
        which made _real_extract hand None back to the downloader; raise an
        explicit ExtractorError instead.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # TODO: build a real info dict once rtmp live streams are supported
        raise ExtractorError(u'Arte live streams are not yet supported, sorry')

    def extractPlus7Stream(self, url):
        """Follow the arte+7 redirection chain and return the info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # extractLiveStream always raises (live streams unsupported)
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1252
1253
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means there was no redirect to follow
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: scrape a direct media URL out of the page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1385
1386
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API: %s = quoted query, %i = 1-based start index (page size 50)
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ytsearch[N|all]:<terms> pseudo-URL and return the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms themselves may
        # legitimately contain colons (e.g. "ytsearch:foo: bar").
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Fixed: the return was missing here, so "ytsearchall:" silently
            # discarded its results.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total results than requested;
            # shrink the limit so the loop terminates.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1453
1454
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present in the HTML while a "next page" link exists
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a gvsearch[N|all]:<terms> pseudo-URL and queue downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms themselves may
        # legitimately contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: queue whatever we found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
1529
1530
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Present in the HTML while a "Next" page link exists
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a yvsearch[N|all]:<terms> pseudo-URL and queue downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms themselves may
        # legitimately contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: queue whatever we found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1609
1610
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: matches playlist-style pages (course, view_play_list,
    # my_playlists, artist, playlist, watch) carrying a p=/a=/list= query
    # parameter, "/p/<id>" paths, and bare PL/EC/UU playlist IDs.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData API: %s = playlist id, %i = page size, %i = 1-based start index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50  # page size requested from the GData API
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which matches without that flag) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch every page of the playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Collect (position, video URL) pairs so the list can be sorted
            # into playlist order below; entries without 'content'
            # (e.g. deleted/private videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last one.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1676
1677
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is fetched as plain HTML from this URL
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker that appears while more videos can be loaded
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Subsequent pages come from this JSON AJAX endpoint
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video IDs linked from *page* (HTML), in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video IDs of a channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # The AJAX endpoint wraps the HTML fragments in JSON
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the "load more" widget no longer offers a next page
                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1735
1736
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect every upload of a user and return a playlist result."""
        # Pull the username out of the URL (or the ytuser: pseudo-scheme).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE results, so
        # keep requesting pages until one comes back short - that page is
        # necessarily the last one.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the unique video ids of this page, preserving order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            # A short page means there is nothing left to fetch.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
1793
1794
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect every video of a blip.tv user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Fixed: previously a non-matching page crashed with an
            # AttributeError instead of a proper extractor error.
            raise ExtractorError(u'Unable to extract user ID')
        page_base = page_base % mobj.group(1)


        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1853
1854
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to its direct download URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 emulates clicking that button)
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction notice to the user
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): the .decode('utf-8') calls below assume Python 2
        # byte strings (webpage is read without decoding); under Python 3
        # they would fail - confirm before porting this extractor.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1902
1903
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; login is optional and failures are non-fatal
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Optionally log in using --username/--password or .netrc credentials.

        Any failure only emits a warning; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: stay anonymous
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being present in the response means the login
            # was rejected (a successful login redirects past it).
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # JavaScript snippets in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD if it is absent
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2000
2001
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to derive the filename extension from the media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract a blip.tv video, handling /play/ redirects, direct media
        responses, and the JSON metadata API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and restart extraction on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Append the JSON-API parameters with the correct separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content depending on the user agent;
        # the iTunes UA yields the JSON metadata this extractor expects.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Hand the already-open response to the downloader via
                # 'urlhandle' (see the class docstring at the top of file)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert the API timestamp to the YYYYMMDD format used
                # throughout this file
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2095
2096
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        # Validate the URL and pull out the numeric video id.
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)

        # Fetch the watch page; the media base URL is embedded in an
        # image_src <link> tag pointing at the thumbnail server.
        webpage = self._download_webpage('http://www.myvideo.de/watch/%s' % video_id, video_id)

        self.report_extraction(video_id)
        media_match = re.search(
            r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
            webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = '%s/%s.flv' % (media_match.group(1), video_id)

        # The page <title> doubles as the video title.
        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2135
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate ids; the rendition list downloaded per item is what
    # actually drives format selection below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate id (used by --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate id (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE whitespace.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one or more video parts for an episode or clip URL.

        Resolves :shortname abbreviations to the show's full-episodes page,
        follows the redirect to the newest episode when no episode is given,
        then walks the MRSS feed and per-item configuration XML to build a
        result dict per part. Raises ExtractorError on any failure.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Abbreviated forms (":tds", ":colbert", ...) map to the show's
        # full-episodes landing page, which is then re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode means "download the newest"; the site
            # redirects the bare full-episodes URL to the latest one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Re-match against the redirected URL to learn the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # The Flash player URI carries the mgid used to query the MRSS feed.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        # Each <item> in the MRSS feed is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # Collect (bitrate, rtmp-url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a plain HTTP one on the CDN mirror;
            # only the trailing "gsp.comedystor/..." id carries over.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2302
2303
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from an Escapist video page.

        Reads description/thumbnail/player URL from <meta> tags, then
        downloads the player configuration to find the actual media URL.
        Raises ExtractorError if the URL is invalid or any required
        metadata is missing (previously a missing tag crashed with
        AttributeError on a None match object).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Each metadata item lives in its own <meta> tag; fail loudly with
        # a descriptive error when one is absent.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config document URL is percent-encoded in the player URL query.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON: single quotes must be
        # rewritten before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # playlist[1] is the actual video entry (playlist[0] is an ad/intro).
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2357
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info in two stages: the moogaloop metadata XML,
        then the f4m manifest it points at, from which the final fragment
        URL is assembled. Raises ExtractorError on any failure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Result dict, filled in incrementally below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # Pull title/description/thumbnail and the manifest location out of
        # the metadata document; findall(...)[0] raises IndexError when a
        # required element is missing.
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # NOTE(review): hdcore query parameter presumably required by the
        # HDS streaming server — confirm against current site behavior.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # The f4m manifest (Adobe f4m/1.0 namespace) supplies the media node
        # id and the document id used to build the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Compose the first-segment/first-fragment URL on the manifest host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2419
2420
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        # Reject anything that does not look like a video URL.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flash video URL sits percent-encoded in a query parameter.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title is taken from the page <title>, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = title_match.group(1)

        # For the thumbnail the entire matched URL is used, not a group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2470
2471
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL itself encodes the uploader name and the track slug.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into full track metadata via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=%s&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' % url
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # The streams endpoint yields the direct 128kbps MP3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/%s/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' % str(video_id)
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        media_url = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      media_url,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2528
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set permalink is being resolved via the API."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract one info dict per track in the set.

        Resolves the set permalink through the SoundCloud API, then fetches
        the stream definitions for each contained track. Raises
        ExtractorError on an invalid URL or on API-reported errors.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # Raise instead of calling report_error and returning None,
            # consistent with error handling elsewhere in this file.
            messages = [compat_str(err['error_message']) for err in info['errors']]
            raise ExtractorError(u'unable to download video webpage: %s' % u', '.join(messages))

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # Each track needs its own streams lookup for the MP3 URL.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2591
2592
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media id is base64-encoded inside an inline script variable.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title also lives in a script variable.
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder when absent.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # Derive id and extension from the final path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2639
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format names either to a dict of bitrate -> url list,
        or directly to a url list when there is no bitrate information.
        'best' (or an unknown bitrate) selects the highest available one.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, probing each with an HTTP
        request; returns None when none of them responds."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print format/bitrate/extension triples for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the audio info via the Mixcloud JSON API.

        NOTE(review): the .decode('utf-8') calls below only work on Python 2
        byte strings; on Python 3 they raise AttributeError on str. The
        extractor is disabled (_WORKING = False), so this is left as-is —
        confirm before re-enabling.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format whose
            # URL actually responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2744
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes: a specific video (course + video pair), a
    course page (playlist of the course's videos) and the site root
    (playlist of all courses). Playlist branches return 'reference'
    entries that are resolved recursively through self.extract().
    """

    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # _download_webpage raises ExtractorError itself on network
            # failure and returns a decoded text string, which
            # ElementTree.fromstring accepts.
            metaXml = self._download_webpage(xmlUrl, info['id'],
                                             note=u'Downloading video info XML',
                                             errnote=u'Unable to download video info XML')
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Resolve each video page through the normal extraction path.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Previously fetched with urlopen().read(), which returns bytes
            # and made the str regex below raise TypeError on Python 3;
            # _download_webpage decodes the page and reports progress itself.
            rootpage = self._download_webpage(rootURL, info['id'],
                                              errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2845
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage returns an already-decoded text string, so
        # the former .decode('iso-8859-1') calls on the matched groups were
        # wrong (str has no .decode on Python 3) and have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed message: previously read "Unable to mtvn_uri".
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Guard: renditions[-1] below would raise a bare IndexError.
            raise ExtractorError(u'Unable to extract any renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # AttributeError covers a rendition without a <src> child
            # (find() returns None, which has no .text).
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2914
2915
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves each video as a series of segments; the segment URLs are
    derived from a per-video 'seed' that scrambles the file id, so the
    helper methods below must replicate the site player's algorithm exactly.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the file-id alphabet shuffled deterministically by seed.

        Uses a linear-congruential sequence to repeatedly pick (and remove)
        one character from the source alphabet; the constants and the
        statement order mirror the site player and must not be changed.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated scrambled fileId via the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # JSON endpoint describing title, seed, formats and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user's --format request onto what the site offers.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment; each key signs that segment's URL.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment index (2 hex digits) into the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3008
3009
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Fetch the page once; every field below is scraped from it.
        webpage = self._download_webpage(url, video_id)

        def scrape(pattern, what):
            # Search the page for one field, failing loudly if absent.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(u'Unable to extract %s' % what)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(scrape(self.VIDEO_URL_RE, u'video url'))
        video_title = scrape(self.VIDEO_TITLE_RE, u'video title')
        video_thumbnail = scrape(self.VIDEO_THUMB_RE, u'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3053
3054
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post page is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the upload date extracted from the post."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the uploader extracted from the post."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the title extracted from the post."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction from the video page."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; None is allowed downstream)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        # NOTE(review): tuples are (resolution-string, url) and the sort is
        # lexicographic on the string — confirm it matches numeric order.
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3164
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The MP4 URL is derived directly from the page path on the CDN.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Scrape a single page property, falling back to default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # Fixed: the key was misspelled 'uploader_date', which is not a
        # recognized metadata field; also normalize the scraped date text
        # to the documented YYYYMMDD form.
        upload_date = _findProp(r'<b>Date:</b> (.*?)</div>')
        if upload_date:
            upload_date = unified_strdate(upload_date)
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'upload_date': upload_date,
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3199
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when walking a channel's archive listing.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one JSON page of the archive API and build info dicts.

        Returns (raw_item_count, infos); the raw count (not len(infos)) is
        what the pagination loop in _real_extract compares against the page
        limit, since clips without a video_file_url are skipped here.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: channel archive, chapter, or single video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through its archive below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else fires only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Walk the paged API; a short page (count != limit) ends the walk.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3332
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player page heading; fall back to the <title> tag.
        title_match = (re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
                       or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3370
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so it must be matched with
        # re.VERBOSE instead of the default suitable() implementation.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Go through the age gate unconditionally so age-restricted pages load.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # Each regex fires once per clip in page order, so zipping the three
        # iterators pairs every movie with its display name and thumbnail.
        clips = zip(re.finditer(urlRE, webpage),
                    re.finditer(namesRE, webpage),
                    re.finditer(thumbsRE, webpage))
        videos = []
        for clip, name, thumb in clips:
            clip_id = clip.group('videoID')
            clip_url = clip.group('videoURL')
            if not clip_url:
                raise ExtractorError(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3415
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The recorded FLV lives at a predictable CDN location.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Previously these matches were used unchecked, crashing with
        # AttributeError on a miss; raise ExtractorError like the other IEs.
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            # Uploader is optional metadata; continue without it.
            self._downloader.report_warning(u'unable to extract uploader')
            uploader = None
        else:
            uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
3437
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The flash player is handed the media URL via so.addVariable("file", ...)
        source_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if source_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = source_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = None
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3486
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded in the page as an inline JSON assignment.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if metadata_match is None:
            raise ExtractorError(u'Cannot find metadata')

        try:
            show = json.loads(metadata_match.group(1))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        # Request the 256 kbps stream from the CDN.
        stream_url = show['akamai_url'] + '&cbr=256'
        extension = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
3521
3522
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if x['format'] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal: warn and continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            # FIX: dropped the redundant "ERROR: " prefix; ExtractorError
            # already labels its messages, so the prefix was printed twice.
            raise ExtractorError(u'no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 4th path component encodes resolution and bitrate,
            # e.g. "480p_370k_8004515" -> ["480p", "370k"]
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the original checked `result` (a stale regex match
            # object that is never None here) instead of `format`, so an
            # unavailable requested format returned [None] instead of
            # raising.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3637
3638
3639
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both id and title are taken from the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: this message previously claimed the *title* could not
            # be extracted; it is the upload date that failed here.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3678
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        # FIX: dropped the redundant "ERROR: " prefix from the messages in
        # this class; ExtractorError already labels its messages, so the
        # prefix was printed twice (inconsistent with the other extractors).
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
3723
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Extract every track of an 8tracks mix as a list of info dicts."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as JSON assigned to PAGE.mix.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id is required by the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track; stop once it has been fetched.
            if api_data['set']['at_last_track']:
                break
            next_url = ('http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s'
                        % (session, mix_id, track_data['id']))
        return entries
3767
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Derive the CDN download URL for a keek and scrape its metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs can be built directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3791
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the match must be done
        # with that flag instead of relying on a plain re.match.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Two parallel regexes: one yields the talk ids/media slugs, the
        # other the talk page URLs and display names. They are consumed in
        # lockstep with zip() below, so their order of appearance in the
        # page is assumed to match.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each entry is deferred to the TED talk extractor via url_result;
        # only the talk URL is needed here.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talk id and media slug live in an inline talkDetails script.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3870
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a myspass.de video via its XML metadata service."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: the fallback referenced the undefined name `ext`,
            # raising NameError whenever <format_id> was absent; fall back
            # to the file extension, as intended.
            format = extension
        else:
            format = format_id_el.text
        # Optional fields: description and preview image may be absent.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3924
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a Spiegel video via its per-video flash XML descriptor."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        # The last child node of the descriptor holds the variant we use.
        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3957
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape a LiveLeak page for the media URL and basic metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        # Title comes from the OpenGraph tag, minus the site branding.
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4004
class ARDIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract an ARD Mediathek video (RTMP stream or direct HTTP mp4)."""
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # FIX: replaced `assert` (stripped under -O and crashing with
            # AssertionError otherwise) with ExtractorError, matching the
            # error-reporting convention of the other extractors.
            if '"fsk"' in html:
                # age-restricted content is only streamed at night
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'Unable to extract media streams')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            # rtmpdump needs the play path with its mp4: prefix
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4043
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract a video embedded in a Tumblr post."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # FIX: corrected the user-facing message typo ("founded" -> "found")
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4077
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the free mp3-320 download of a Bandcamp track."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # FIX: corrected the user-facing message typo ("founded" -> "found")
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed the local from `id` (shadowed the builtin) to video_id.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4123
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Scrape the direct mp4 URL and title from a redtube page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is exposed directly in a <source> tag.
        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4154
4155
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # The most specific YouTube URL forms come first so that they are
        # not swallowed by the plain video extractor below them.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        # GenericIE is the catch-all fallback and must stay last.
        GenericIE()
    ]
4214
def get_info_extractor(ie_name):
    """Look up the info extractor class named ``<ie_name>IE`` in this module."""
    return globals()['%sIE' % ie_name]