InfoExtractors: use report_download_webpage in _request_webpage
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The FileDownloader this IE reports to; set via set_downloader().
    _downloader = None
    # Set to False in subclasses for known-broken extractors.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default name: class name with the trailing "IE" stripped
        # (e.g. "YoutubeIE" -> "Youtube"); subclasses may override.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle.

        note=None prints the default "Downloading webpage" message,
        note=False suppresses output entirely, any other value is
        printed verbatim.  Raises ExtractorError on network failure.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Attach the original traceback so the failure site is preserved.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string (decoded text)."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            # Dump goes straight to the downloader, without the [ie] prefix.
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared charsets.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything not listed falls back to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "heightxwidth" display string used by --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; let the playlist IE take them.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} or, on failure, an (error, None) tuple."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick language: user preference, then English, then whatever is first.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of result tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print "itag : extension [dimensions]" for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in (params/.netrc), and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX / dsh form tokens that Google requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID extracted from url, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # Group 1 is the (optional) URL prefix; group 2 is the video ID itself.
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' values, stop at the first that yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then try each known date layout.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # This layout did not match; try the next one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' field;
            # an entry without one would raise KeyError here — confirm upstream format.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Confirms the family-filter disclaimer once at initialization, then
    extracts the media URL, title and uploader from a watch page. Watch
    URLs whose video id starts with 'yt-' are delegated to the YouTube
    extractor via url_result().
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        # Retrieve disclaimer page (best effort: errors are reported, not raised)
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age by POSTing the filter form; the server keys this on cookies
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube ('yt-<id>' ids are embeds)
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # _download_webpage returns a unicode string, so match groups below
        # must NOT be .decode()d again (doing so raised UnicodeEncodeError
        # for non-ASCII titles on Python 2 and AttributeError on Python 3).
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available (access token appended to the media URL)
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: parse the flashvars blob for mediaData JSON
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # mediaURL is JSON-escaped ('\/' for '/')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
789
790
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the video URL, title, uploader and date from a watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the URL path component up to the first '_' or '?'
        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the watch page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # Locate the flashvars blob that holds the per-quality URLs
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best quality key that is present, highest first
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # The URL is percent- and JSON-escaped
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Uploader: prefer the "owner" span, fall back to the official author span
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # Upload date is shown as DD-MM-YYYY; convert to YYYYMMDD
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
874
875
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Extracts the direct .flv URL, title and uploader from a Photobucket
    media page.
    """

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # _download_webpage reports progress, handles network errors and
        # returns a unicode string; the previous raw urlopen().read() call
        # returned bytes (breaking the str-pattern regexes on Python 3) and
        # the match groups were then spuriously .decode('utf-8')d.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> tag
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
931
932
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ URLs are first fetched to discover the canonical id/vid
    pair, then re-dispatched through _real_extract with the rewritten
    /watch/ URL. The actual media URL comes from a playlist XML endpoint.

    NOTE(review): webpage is obtained via raw urlopen().read(), which
    returns bytes on Python 3 while the regex patterns below are str;
    the .decode('utf-8') calls likewise assume Python 2 str. This is
    presumably why _WORKING is False -- confirm before re-enabling.
    """

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url, new_video=True):
        # NOTE(review): `new_video` is passed False on the recursive call
        # below but is never read anywhere in this method.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            # The page embeds the canonical ("id", "vid") pair in script calls
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical /watch/ URL
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # NOTE(review): group(1) is the 'people|profile' alternation, not the
        # uploader name captured in group(2) -- verify before re-enabling.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist endpoint below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML (APP + FULLPATH are concatenated)
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1066
1067
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the player config JSON embedded in the watch page, then picks
    the best available (quality, codec) pair and builds a play_redirect URL.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # HLS redirect links don't carry the config; use the canonical page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON between ' = {config:' and ',assets:'
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing from the page;
            # ValueError: the extracted fragment is not valid JSON.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.)
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket, best quality first
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1178
1179
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles two page kinds: live-stream index pages (matched by _LIVE_URL)
    and "Plus 7" catch-up pages, each resolved by chasing a chain of
    intermediate URLs with grep_webpage().
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def fetch_webpage(self, url):
        # Downloads url and returns its content.
        # NOTE(review): on error this reports and falls through, returning
        # None -- callers (grep_webpage) then pass None to re.search.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetches url and applies regex (with regexFlags); returns a dict
        # built from matchTuples, each a (group_index, key, error_message)
        # triple. Reports the given error and returns None when a required
        # group is missing.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Resolve the live-stream rtmp URL via the videothek JS file.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or stored,
        # so _real_extract yields nothing for live streams.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Resolve a Plus 7 page: movie param -> language <video> ref -> hd url.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): .decode('utf-8') below assumes the title is a byte
        # string (fetch_webpage returns raw urlopen bytes on py2) -- confirm.
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # Live pages return nothing (see extractLiveStream note); Plus 7
        # pages return a single-entry result list.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1306
1307
1308 class GenericIE(InfoExtractor):
1309     """Generic last-resort information extractor."""
1310
1311     _VALID_URL = r'.*'
1312     IE_NAME = u'generic'
1313
1314     def __init__(self, downloader=None):
1315         InfoExtractor.__init__(self, downloader)
1316
1317     def report_download_webpage(self, video_id):
1318         """Report webpage download."""
1319         if not self._downloader.params.get('test', False):
1320             self._downloader.report_warning(u'Falling back on generic information extractor.')
1321         super(GenericIE, self).report_download_webpage(video_id)
1322
1323     def report_following_redirect(self, new_url):
1324         """Report information extraction."""
1325         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1326
    def _test_redirect(self, url):
        """Check whether the URL is a redirect (e.g. a URL shortener).

        Returns the final URL if a redirect was followed, False otherwise.
        Uses HEAD requests to avoid downloading the body just to discover
        the target.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other redirect-ish code is treated as an error
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying with GET
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener from scratch (not build_opener) so only these
        # handlers participate; order determines handler priority.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: final URL equals the input
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1380
1381     def _real_extract(self, url):
1382         new_url = self._test_redirect(url)
1383         if new_url: return [self.url_result(new_url)]
1384
1385         video_id = url.split('/')[-1]
1386         try:
1387             webpage = self._download_webpage(url, video_id)
1388         except ValueError as err:
1389             # since this is the last-resort InfoExtractor, if
1390             # this error is thrown, it'll be thrown here
1391             self._downloader.report_error(u'Invalid URL: %s' % url)
1392             return
1393
1394         self.report_extraction(video_id)
1395         # Start with something easy: JW Player in SWFObject
1396         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1397         if mobj is None:
1398             # Broaden the search a little bit
1399             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1400         if mobj is None:
1401             # Broaden the search a little bit: JWPlayer JS loader
1402             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1403         if mobj is None:
1404             self._downloader.report_error(u'Invalid URL: %s' % url)
1405             return
1406
1407         # It's possible that one of the regexes
1408         # matched, but returned an empty group:
1409         if mobj.group(1) is None:
1410             self._downloader.report_error(u'Invalid URL: %s' % url)
1411             return
1412
1413         video_url = compat_urllib_parse.unquote(mobj.group(1))
1414         video_id = os.path.basename(video_url)
1415
1416         # here's a fun little line of code for you:
1417         video_extension = os.path.splitext(video_id)[1][1:]
1418         video_id = os.path.splitext(video_id)[0]
1419
1420         # it's tempting to parse this further, but you would
1421         # have to take into account all the variations like
1422         #   Video Title - Site Name
1423         #   Site Name | Video Title
1424         #   Video Title - Tagline | Site Name
1425         # and so on and so forth; it's just not practical
1426         mobj = re.search(r'<title>(.*)</title>', webpage)
1427         if mobj is None:
1428             self._downloader.report_error(u'unable to extract title')
1429             return
1430         video_title = mobj.group(1)
1431
1432         # video uploader is domain name
1433         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1434         if mobj is None:
1435             self._downloader.report_error(u'unable to extract title')
1436             return
1437         video_uploader = mobj.group(1)
1438
1439         return [{
1440             'id':       video_id,
1441             'url':      video_url,
1442             'uploader': video_uploader,
1443             'upload_date':  None,
1444             'title':    video_title,
1445             'ext':      video_extension,
1446         }]
1447
1448
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (ytsearchN:query)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result list was previously discarded here
            # (missing return), so 'ytsearchall:' queries yielded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # the API reports the true total, so the limit may shrink
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1522
1523
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (gvsearchN:query)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_videos(self, video_ids):
        """Queue every collected video id for download (dedup helper)."""
        for vid in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached; flush and stop.
                        # (previously this loop was duplicated verbatim below)
                        self._download_videos(video_ids)
                        return

            # No "next page" link: last page, flush whatever we found.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_videos(video_ids)
                return

            pagenum += 1
1604
1605
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (yvsearchN:query)."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_videos(self, video_ids):
        """Queue every collected video id for download (dedup helper)."""
        for vid in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached; flush and stop.
                        # (previously this loop was duplicated verbatim below)
                        self._download_videos(video_ids)
                        return

            # No "Next" link: last page, flush whatever we found.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_videos(video_ids)
                return

            pagenum += 1
1690
1691
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUG FIX: playlist_title was previously assigned only inside the
        # loop; a playlist with no entries on the first page broke out
        # before the assignment and raised NameError at the return below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # keep (position, url) so the final list can be order-sorted
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1774
1775
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, first occurrence wins."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            vid = match.group(1)
            if vid not in found:
                found.append(vid)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and return a playlist result."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # First page is plain HTML.
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Subsequent pages come from the JSON-based channel_ajax endpoint,
        # requested for as long as a "load more" widget is advertised.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    raw = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(raw)
                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1848
1849
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Gather all upload ids of a user via the GData API, page by page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response (currently at 50 entries), so
        # request consecutive windows until a short page signals the end.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids on this page, keeping first occurrences only.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)

            video_ids.extend(page_ids)

            # A page shorter than the window size must be the last one;
            # no need to issue another query.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1920
1921
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Return a playlist of every video posted by a blip.tv user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # ROBUSTNESS: previously a missing data-users-id attribute raised an
        # uncaught AttributeError inside the network try-block above.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # CONSISTENCY: use compat_str(err) like the rest of the file
                # (this previously used str(err))
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2000
2001
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles.com page to the direct file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # BUG FIX: the pattern was previously the non-raw string
                # '\s+' - an invalid escape sequence on modern Pythons.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2052
2053
2054 class FacebookIE(InfoExtractor):
2055     """Information Extractor for Facebook"""
2056
2057     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2058     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2059     _NETRC_MACHINE = 'facebook'
2060     IE_NAME = u'facebook'
2061
    def report_login(self):
        """Report attempt to log in."""
        # Thin wrapper so login progress goes through the IE's standard
        # screen-output channel (prefixed with the IE name).
        self.to_screen(u'Logging in')
2065
2066     def _real_initialize(self):
2067         if self._downloader is None:
2068             return
2069
2070         useremail = None
2071         password = None
2072         downloader_params = self._downloader.params
2073
2074         # Attempt to use provided username and password or .netrc data
2075         if downloader_params.get('username', None) is not None:
2076             useremail = downloader_params['username']
2077             password = downloader_params['password']
2078         elif downloader_params.get('usenetrc', False):
2079             try:
2080                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2081                 if info is not None:
2082                     useremail = info[0]
2083                     password = info[2]
2084                 else:
2085                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2086             except (IOError, netrc.NetrcParseError) as err:
2087                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2088                 return
2089
2090         if useremail is None:
2091             return
2092
2093         # Log in
2094         login_form = {
2095             'email': useremail,
2096             'pass': password,
2097             'login': 'Log+In'
2098             }
2099         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2100         try:
2101             self.report_login()
2102             login_results = compat_urllib_request.urlopen(request).read()
2103             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2104                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2105                 return
2106         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2107             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2108             return
2109
2110     def _real_extract(self, url):
2111         mobj = re.match(self._VALID_URL, url)
2112         if mobj is None:
2113             self._downloader.report_error(u'invalid URL: %s' % url)
2114             return
2115         video_id = mobj.group('ID')
2116
2117         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2118         webpage = self._download_webpage(url, video_id)
2119
2120         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2121         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2122         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2123         if not m:
2124             raise ExtractorError(u'Cannot parse data')
2125         data = dict(json.loads(m.group(1)))
2126         params_raw = compat_urllib_parse.unquote(data['params'])
2127         params = json.loads(params_raw)
2128         video_data = params['video_data'][0]
2129         video_url = video_data.get('hd_src')
2130         if not video_url:
2131             video_url = video_data['sd_src']
2132         if not video_url:
2133             raise ExtractorError(u'Cannot find video URL')
2134         video_duration = int(video_data['video_duration'])
2135         thumbnail = video_data['thumbnail_src']
2136
2137         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2138         if not m:
2139             raise ExtractorError(u'Cannot find title in webpage')
2140         video_title = unescapeHTML(m.group(1))
2141
2142         info = {
2143             'id': video_id,
2144             'title': video_title,
2145             'url': video_url,
2146             'ext': 'mp4',
2147             'duration': video_duration,
2148             'thumbnail': thumbnail,
2149         }
2150         return [info]
2151
2152
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Matches the lowercase extension at the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        /play/ URLs are first resolved through their HTTP redirect to a
        canonical /a/a-<id> URL and re-extracted recursively. Otherwise
        the page's JSON API (skin=json) is queried; if the server
        responds with a video/* content type instead of JSON, the URL is
        treated as a direct media download.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the redirect; the target URL's fragment carries a
            # 'file' reference whose last path segment is the media id.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves the JSON variant when queried with the iTunes UA.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): str.decode only exists on Python 2 —
                # confirm before running this branch under Python 3.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle for the download.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp format example: '12-31-12 11:47PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # The downloader must reuse the same UA or blip.tv
                    # may serve different content.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2249
2250
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract the flv stream URL and title for a myvideo.de page.

        Returns a single-element list of info dicts, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: was self._download.report_error, which would raise
            # AttributeError — the attribute is named self._downloader.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link exposes the media server base URL; the flv
        # lives at <base>/<video_id>.flv on the same host.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2295
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate codes, lowest index = highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (used by _print_formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Pixel dimensions per bitrate (used by _print_formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report download of the mediagen configuration for one part."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index feed."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format codes for --list-formats."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part for an episode or clip.

        Shortname URLs (e.g. ':tds') are expanded to the show's
        full-episodes page, which the server redirects to the newest
        episode. Each episode consists of parts listed in an MRSS index;
        for every part the mediagen config XML is fetched and the RTMP
        rendition URL is rewritten into a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Expand abbreviations to the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest one".
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected to a concrete episode; re-parse the
            # final URL to recover its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid looks like mgid:...:<show>.com:<id>; pull both
            # the short media id and the show name out of it.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for every rendition.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a direct HTTP download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2487
2488
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report that the player configuration is being downloaded."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video info for an Escapist episode page.

        The page's og: meta tags point at the Flash player URL, whose
        'config=' query argument locates a JavaScript-flavoured
        configuration file that in turn contains the actual video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset advertised in Content-Type,
            # defaulting to UTF-8 when no charset is given.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the searches below assume the meta tags are
        # always present — a missing tag would raise AttributeError on
        # .group(1); confirm whether that is acceptable here.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            # Same charset-sniffing dance as for the webpage above.
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # The second playlist entry holds the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2559
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info for a collegehumor.com video page.

        Fetches the moogaloop metadata XML (title, description,
        thumbnail, manifest URL), then the f4m manifest, and assembles
        the URL of the stream's first fragment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore parameter is required for Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The manifest elements live in the Adobe f4m namespace.
            # NOTE: video_id is deliberately rebound to the manifest id.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Build the URL of the first segment/fragment of the stream.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2626
2627
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a video page.

        Returns a single-element list of info dicts, or None after
        reporting an error for any missing piece.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv location is URL-encoded inside the player parameters.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The page title carries the video title before the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail URL is taken verbatim from the page markup.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2681
2682
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a track page to its numeric id and mp3 stream URL."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Both the uploader name and the song slug come from the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Step 1: resolve the canonical track URL to its JSON metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_request = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Step 2: ask the streams endpoint for the actual media URLs.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_request = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2751
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    # group 1 = uploader slug, group 2 = set slug
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): same IE_NAME as the single-track extractor — presumably
    # intentional so both appear under one name; confirm.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set URL through the Soundcloud API and return one
        info dict per track in the set (or None after reporting an error)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        # NOTE(review): simple_title is currently unused
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # The resolve endpoint turns the permalink URL into the set's JSON
        # metadata, including the track list.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # API-level errors are reported individually, then we give up.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # One streams request per track to obtain the direct mp3 URL.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2828
2829
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Scrape an InfoQ presentation page and return its info dict."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Video URL: the page embeds a base64-encoded rtmpe path
        url_match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(
            base64.b64decode(url_match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title
        title_match = re.search(r'contentTitle = "(.*?)";', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Description is optional; fall back to a placeholder
        desc_match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if desc_match is not None:
            video_description = desc_match.group(1)
        else:
            video_description = u'No description available.'

        # Derive id and extension from the final path component
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2879
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format name to either a plain url list or a dict of
        bitrate -> url list; 'best' (or an unknown bitrate) selects the
        highest available bitrate. Returns the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print all available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch cloudcast JSON metadata and return the info dict list.

        Returns None after reporting an error (bad URL, download failure,
        or unavailable format).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url.
        # re.match on a text URL already yields text; the old
        # .decode('utf-8') calls crashed on Python 3 str objects.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (urlopen returns bytes; decode before json.loads so
        # this also works on Python < 3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each format until one of its urls responds
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2990
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: a specific video (course= & video= params), a
    # course page (course= only), and the site root (anything else).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, or recurse over a course/root playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # Per-video metadata lives in an XML file alongside the video.
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Recurse into every video page linked from the course page.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page linked from the root page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3094
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        Returns a one-element list with the info dict, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from <meta> tags on the page.
        # re.search on the text webpage already yields text; the old
        # .decode('iso-8859-1') calls crashed on Python 3 str objects.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # was 'unable to mtvn_uri' — name the missing field properly
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality (assumes renditions are
        # listed lowest-to-highest — TODO confirm against live responses).
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }]
3170
3171
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id for download URLs: millisecond timestamp plus two
        # random numeric suffixes.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet with a linear
        # congruential generator driven by the server-provided seed; the
        # scrambled fileid indexes into this shuffled list.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Translate the '*'-separated index string into the real file id
        # via the seed-shuffled alphabet; empty segments are skipped.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        # Fetch the getPlayList JSON, descramble the file id and build one
        # info dict per video segment.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format to Youku's format/extension names.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3273
3274
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape url, title and thumbnail out of the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        url_match = re.search(self.VIDEO_URL_RE, webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        thumb_match = re.search(self.VIDEO_THUMB_RE, webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3329
3330
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Returns a one-element list with the info dict, or None after
        reporting an error.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # bail out; without the return, mobj.group(1) below raised
            # AttributeError instead of failing cleanly
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # bail out; without the return, links[-1] below raised IndexError
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3454
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com / www.nba.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media file lives at a predictable CDN location derived from
        # the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # key fixed: was 'uploader_date', a typo no consumer reads
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3490
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (total_item_count, info_dicts).

        Raises ExtractorError on download or API failure; the original
        code returned None here, which made the tuple unpacking in
        _real_extract crash with a TypeError.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API answers with a JSON object, not a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a file URL are skipped but still counted
            # in the returned total (pagination uses the raw count).
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Collect info dicts for a broadcast or a whole channel archive."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3573
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of only reporting: the original fell through
            # to m.group() on None and crashed with AttributeError.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the document title if the player header is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3612
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE must be passed.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')
        # The age-check bypass page lists every movie of the game.
        age_gate_url = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(age_gate_url, game_id)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # Movies, their display names and their thumbnails appear in the
        # same order, so the three iterators are walked in lockstep.
        movie_iter = re.finditer(r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", webpage)
        name_iter = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_iter = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for movie, name, thumb in zip(movie_iter, name_iter, thumb_iter):
            movie_id = movie.group('videoID')
            movie_url = movie.group('videoURL')
            if not movie_url:
                self._downloader.report_error(u'Cannot find video url for %s' % movie_id)
            videos.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
3656
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV file is addressed on the CDN directly by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3678
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct link to the hosted media file (mp4 or flv).
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        video_id = re.match(self._VALID_URL, url).group('id')

        source_match = re.search(_src_url, webpage_src)
        if source_match is None:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return
        video_url = source_match.group()
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"""<title>(.*)</title>""", webpage_src)
        if title_match is not None:
            title = title_match.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r"""rel="image_src" href="(.*)" />""", webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
3734
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as an inline JSON assignment.
        metadata_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the highest audio bitrate from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3769
3770
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: the original checked the stale variable `result`
            # here, so a missing requested format returned [None] instead
            # of reporting the error.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3887
3888
3889
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the previous message wrongly said "video title".
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3931
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the landing page first; the media URL only appears on
        # the embed page linked from it.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The embed page carries the numeric id used from here on.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3977
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id keeps the play API happy.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        # The API only reveals one track at a time; keep asking for the
        # next track until it flags the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4021
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Both the media file and the thumbnail live on the CDN and are
        # addressed directly by the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': unescapeHTML(title_match.group('title')),
            'thumbnail': thumbnail,
            'uploader': clean_html(uploader_match.group('uploader')),
        }]
4045
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    A talk URL is extracted directly; a playlist URL is expanded into
    url_result entries that are re-dispatched through this extractor.
    """
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on whether the URL names a single talk or a playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches one <li> per talk in the playlist markup; the captured
        # mediaSlug is not used here since every talk is re-dispatched.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # The entry and name matches appear in the same document order,
        # so they are walked in lockstep.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # talkDetails is an inline JS object carrying the numeric id and
        # the mediaSlug needed to build the direct download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4123
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`,
            # raising NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4179
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The stream variants are published in a separate XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last listed variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4212
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Raise instead of only reporting: the original fell through
            # to m.group() on None and crashed with AttributeError.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4261
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId from the query string; fall back
        # to the path component captured by _VALID_URL.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # Title and the media stream list are both embedded in the page HTML.
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # Restrict to the default media type (0) and take the best quality.
        candidates = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(candidates, key=lambda s: int(s["quality"]))

        # Two possibilities: an RTMP stream or a plain HTTP download.
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # rtmpdump needs both the stream URL and the play path.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4301
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""

    # Dots escaped so that hosts like "footumblrxcom" no longer match.
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Rebuild the canonical post URL; the input may carry any slug.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with \x22-escaped quote characters.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster; a post without one previously crashed
        # with AttributeError, so degrade to no thumbnail instead.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4335
4336
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Classes listed in matching priority; GenericIE must stay last.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
4393
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention at module level.
    return globals()['%sIE' % ie_name]