_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             note = u'Downloading video webpage'
 118         self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 119         try:
 120             return compat_urllib_request.urlopen(url_or_request)
 121         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 122             if errnote is None:
 123                 errnote = u'Unable to download webpage'
 124             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 125
 126     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 127         """ Returns the data of the page as a string """
 128         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 129         content_type = urlh.headers.get('Content-Type', '')
 130         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 131         if m:
 132             encoding = m.group(1)
 133         else:
 134             encoding = 'utf-8'
 135         webpage_bytes = urlh.read()
 136         return webpage_bytes.decode(encoding, 'replace')
 137
 138     #Methods for following #608
 139     #They set the correct value of the '_type' key
 140     def video_result(self, video_info):
 141         """Returns a video"""
 142         video_info['_type'] = 'video'
 143         return video_info
 144     def url_result(self, url, ie=None):
 145         """Returns a url that points to a page that should be processed"""
 146         #TODO: ie should be the class used for getting the info
 147         video_info = {'_type': 'url',
 148                       'url': url}
 149         return video_info
 150     def playlist_result(self, entries):
 151         """Returns a playlist"""
 152         video_info = {'_type': 'playlist',
 153                       'entries': entries}
 154         return video_info
 155
 156
 157 class YoutubeIE(InfoExtractor):
 158     """Information extractor for youtube.com."""
 159
 160     _VALID_URL = r"""^
 161                      (
 162                          (?:https?://)?                                       # http(s):// (optional)
 163                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 164                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 165                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 166                          (?:                                                  # the various things that can precede the ID:
 167                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 168                              |(?:                                             # or the v= param in all its forms
 169                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 170                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 171                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 172                                  v=
 173                              )
 174                          )?                                                   # optional -> youtube.com/xxxx is OK
 175                      )?                                                       # all until now is optional -> you can pass the naked ID
 176                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 177                      (?(1).+)?                                                # if we found the ID, everything can follow
 178                      $"""
 179     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 180     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 181     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 182     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 183     _NETRC_MACHINE = 'youtube'
 184     # Listed in order of quality
 185     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 186     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 187     _video_extensions = {
 188         '13': '3gp',
 189         '17': 'mp4',
 190         '18': 'mp4',
 191         '22': 'mp4',
 192         '37': 'mp4',
 193         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 194         '43': 'webm',
 195         '44': 'webm',
 196         '45': 'webm',
 197         '46': 'webm',
 198     }
 199     _video_dimensions = {
 200         '5': '240x400',
 201         '6': '???',
 202         '13': '???',
 203         '17': '144x176',
 204         '18': '360x640',
 205         '22': '720x1280',
 206         '34': '360x640',
 207         '35': '480x854',
 208         '37': '1080x1920',
 209         '38': '3072x4096',
 210         '43': '360x640',
 211         '44': '480x854',
 212         '45': '720x1280',
 213         '46': '1080x1920',
 214     }
 215     IE_NAME = u'youtube'
 216
 217     @classmethod
 218     def suitable(cls, url):
 219         """Receives a URL and returns True if suitable for this IE."""
 220         if YoutubePlaylistIE.suitable(url): return False
 221         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 222
 223     def report_lang(self):
 224         """Report attempt to set language."""
 225         self._downloader.to_screen(u'[youtube] Setting language')
 226
 227     def report_login(self):
 228         """Report attempt to log in."""
 229         self._downloader.to_screen(u'[youtube] Logging in')
 230
 231     def report_age_confirmation(self):
 232         """Report attempt to confirm age."""
 233         self._downloader.to_screen(u'[youtube] Confirming age')
 234
 235     def report_video_webpage_download(self, video_id):
 236         """Report attempt to download video webpage."""
 237         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 238
 239     def report_video_info_webpage_download(self, video_id):
 240         """Report attempt to download video info webpage."""
 241         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 242
 243     def report_video_subtitles_download(self, video_id):
 244         """Report attempt to download video info webpage."""
 245         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
 246
 247     def report_video_subtitles_request(self, video_id, sub_lang, format):
 248         """Report attempt to download video info webpage."""
 249         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 250
 251     def report_video_subtitles_available(self, video_id, sub_lang_list):
 252         """Report available subtitles."""
 253         sub_lang = ",".join(list(sub_lang_list.keys()))
 254         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
 255
 256     def report_information_extraction(self, video_id):
 257         """Report attempt to extract video information."""
 258         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 259
 260     def report_unavailable_format(self, video_id, format):
 261         """Report extracted video URL."""
 262         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 263
 264     def report_rtmp_download(self):
 265         """Indicate the download will use the RTMP protocol."""
 266         self._downloader.to_screen(u'[youtube] RTMP download detected')
 267
 268     def _get_available_subtitles(self, video_id):
 269         self.report_video_subtitles_download(video_id)
 270         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 271         try:
 272             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 273         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 274             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 275         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 276         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 277         if not sub_lang_list:
 278             return (u'WARNING: video doesn\'t have subtitles', None)
 279         return sub_lang_list
 280
 281     def _list_available_subtitles(self, video_id):
 282         sub_lang_list = self._get_available_subtitles(video_id)
 283         self.report_video_subtitles_available(video_id, sub_lang_list)
 284
 285     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 286         self.report_video_subtitles_request(video_id, sub_lang, format)
 287         params = compat_urllib_parse.urlencode({
 288             'lang': sub_lang,
 289             'name': sub_name,
 290             'v': video_id,
 291             'fmt': format,
 292         })
 293         url = 'http://www.youtube.com/api/timedtext?' + params
 294         try:
 295             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 296         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 297             return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
 298         if not sub:
 299             return (u'WARNING: Did not fetch video subtitles', None)
 300         return (None, sub_lang, sub)
 301
 302     def _extract_subtitle(self, video_id):
 303         sub_lang_list = self._get_available_subtitles(video_id)
 304         sub_format = self._downloader.params.get('subtitlesformat')
 305         if self._downloader.params.get('subtitleslang', False):
 306             sub_lang = self._downloader.params.get('subtitleslang')
 307         elif 'en' in sub_lang_list:
 308             sub_lang = 'en'
 309         else:
 310             sub_lang = list(sub_lang_list.keys())[0]
 311         if not sub_lang in sub_lang_list:
 312             return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
 313
 314         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 315         return [subtitle]
 316
 317     def _extract_all_subtitles(self, video_id):
 318         sub_lang_list = self._get_available_subtitles(video_id)
 319         sub_format = self._downloader.params.get('subtitlesformat')
 320         subtitles = []
 321         for sub_lang in sub_lang_list:
 322             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 323             subtitles.append(subtitle)
 324         return subtitles
 325
 326     def _print_formats(self, formats):
 327         print('Available formats:')
 328         for x in formats:
 329             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 330
 331     def _real_initialize(self):
 332         if self._downloader is None:
 333             return
 334
 335         username = None
 336         password = None
 337         downloader_params = self._downloader.params
 338
 339         # Attempt to use provided username and password or .netrc data
 340         if downloader_params.get('username', None) is not None:
 341             username = downloader_params['username']
 342             password = downloader_params['password']
 343         elif downloader_params.get('usenetrc', False):
 344             try:
 345                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 346                 if info is not None:
 347                     username = info[0]
 348                     password = info[2]
 349                 else:
 350                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 351             except (IOError, netrc.NetrcParseError) as err:
 352                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 353                 return
 354
 355         # Set language
 356         request = compat_urllib_request.Request(self._LANG_URL)
 357         try:
 358             self.report_lang()
 359             compat_urllib_request.urlopen(request).read()
 360         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 361             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 362             return
 363
 364         # No authentication to be performed
 365         if username is None:
 366             return
 367
 368         request = compat_urllib_request.Request(self._LOGIN_URL)
 369         try:
 370             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 371         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 372             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 373             return
 374
 375         galx = None
 376         dsh = None
 377         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 378         if match:
 379           galx = match.group(1)
 380
 381         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 382         if match:
 383           dsh = match.group(1)
 384
 385         # Log in
 386         login_form_strs = {
 387                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 388                 u'Email': username,
 389                 u'GALX': galx,
 390                 u'Passwd': password,
 391                 u'PersistentCookie': u'yes',
 392                 u'_utf8': u'霱',
 393                 u'bgresponse': u'js_disabled',
 394                 u'checkConnection': u'',
 395                 u'checkedDomains': u'youtube',
 396                 u'dnConn': u'',
 397                 u'dsh': dsh,
 398                 u'pstMsg': u'0',
 399                 u'rmShown': u'1',
 400                 u'secTok': u'',
 401                 u'signIn': u'Sign in',
 402                 u'timeStmp': u'',
 403                 u'service': u'youtube',
 404                 u'uilel': u'3',
 405                 u'hl': u'en_US',
 406         }
 407         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 408         # chokes on unicode
 409         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 410         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 411         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 412         try:
 413             self.report_login()
 414             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 415             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 416                 self._downloader.report_warning(u'unable to log in: bad username or password')
 417                 return
 418         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 419             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 420             return
 421
 422         # Confirm age
 423         age_form = {
 424                 'next_url':     '/',
 425                 'action_confirm':   'Confirm',
 426                 }
 427         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 428         try:
 429             self.report_age_confirmation()
 430             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 431         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 432             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 433             return
 434
 435     def _extract_id(self, url):
 436         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 437         if mobj is None:
 438             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 439             return
 440         video_id = mobj.group(2)
 441         return video_id
 442
 443     def _real_extract(self, url):
 444         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 445         mobj = re.search(self._NEXT_URL_RE, url)
 446         if mobj:
 447             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 448         video_id = self._extract_id(url)
 449
 450         # Get video webpage
 451         self.report_video_webpage_download(video_id)
 452         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 453         request = compat_urllib_request.Request(url)
 454         try:
 455             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 456         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 457             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
 458             return
 459
 460         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 461
 462         # Attempt to extract SWF player URL
 463         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 464         if mobj is not None:
 465             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 466         else:
 467             player_url = None
 468
 469         # Get video info
 470         self.report_video_info_webpage_download(video_id)
 471         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 472             video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 473                     % (video_id, el_type))
 474             request = compat_urllib_request.Request(video_info_url)
 475             try:
 476                 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
 477                 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
 478                 video_info = compat_parse_qs(video_info_webpage)
 479                 if 'token' in video_info:
 480                     break
 481             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 482                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
 483                 return
 484         if 'token' not in video_info:
 485             if 'reason' in video_info:
 486                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
 487             else:
 488                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 489             return
 490
 491         # Check for "rental" videos
 492         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 493             self._downloader.trouble(u'ERROR: "rental" videos not supported')
 494             return
 495
 496         # Start extracting information
 497         self.report_information_extraction(video_id)
 498
 499         # uploader
 500         if 'author' not in video_info:
 501             self._downloader.trouble(u'ERROR: unable to extract uploader name')
 502             return
 503         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 504
 505         # uploader_id
 506         video_uploader_id = None
 507         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 508         if mobj is not None:
 509             video_uploader_id = mobj.group(1)
 510         else:
 511             self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 512
 513         # title
 514         if 'title' not in video_info:
 515             self._downloader.trouble(u'ERROR: unable to extract video title')
 516             return
 517         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 518
 519         # thumbnail image
 520         if 'thumbnail_url' not in video_info:
 521             self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 522             video_thumbnail = ''
 523         else:   # don't panic if we can't find it
 524             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 525
 526         # upload date
 527         upload_date = None
 528         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 529         if mobj is not None:
 530             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 531             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 532             for expression in format_expressions:
 533                 try:
 534                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 535                 except:
 536                     pass
 537
 538         # description
 539         video_description = get_element_by_id("eow-description", video_webpage)
 540         if video_description:
 541             video_description = clean_html(video_description)
 542         else:
 543             video_description = ''
 544
 545         # subtitles
 546         video_subtitles = None
 547
 548         if self._downloader.params.get('writesubtitles', False):
 549             video_subtitles = self._extract_subtitle(video_id)
 550             if video_subtitles:
 551                 (sub_error, sub_lang, sub) = video_subtitles[0]
 552                 if sub_error:
 553                     self._downloader.trouble(sub_error)
 554
 555         if self._downloader.params.get('allsubtitles', False):
 556             video_subtitles = self._extract_all_subtitles(video_id)
 557             for video_subtitle in video_subtitles:
 558                 (sub_error, sub_lang, sub) = video_subtitle
 559                 if sub_error:
 560                     self._downloader.trouble(sub_error)
 561
 562         if self._downloader.params.get('listsubtitles', False):
 563             sub_lang_list = self._list_available_subtitles(video_id)
 564             return
 565
 566         if 'length_seconds' not in video_info:
 567             self._downloader.trouble(u'WARNING: unable to extract video duration')
 568             video_duration = ''
 569         else:
 570             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 571
 572         # token
 573         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 574
 575         # Decide which formats to download
 576         req_format = self._downloader.params.get('format', None)
 577
 578         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 579             self.report_rtmp_download()
 580             video_url_list = [(None, video_info['conn'][0])]
 581         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 582             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 583             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 584             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 585             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 586
 587             format_limit = self._downloader.params.get('format_limit', None)
 588             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 589             if format_limit is not None and format_limit in available_formats:
 590                 format_list = available_formats[available_formats.index(format_limit):]
 591             else:
 592                 format_list = available_formats
 593             existing_formats = [x for x in format_list if x in url_map]
 594             if len(existing_formats) == 0:
 595                 self._downloader.trouble(u'ERROR: no known formats available for video')
 596                 return
 597             if self._downloader.params.get('listformats', None):
 598                 self._print_formats(existing_formats)
 599                 return
 600             if req_format is None or req_format == 'best':
 601                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 602             elif req_format == 'worst':
 603                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 604             elif req_format in ('-1', 'all'):
 605                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 606             else:
 607                 # Specific formats. We pick the first in a slash-delimeted sequence.
 608                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 609                 req_formats = req_format.split('/')
 610                 video_url_list = None
 611                 for rf in req_formats:
 612                     if rf in url_map:
 613                         video_url_list = [(rf, url_map[rf])]
 614                         break
 615                 if video_url_list is None:
 616                     self._downloader.trouble(u'ERROR: requested format not available')
 617                     return
 618         else:
 619             self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 620             return
 621
 622         results = []
 623         for format_param, video_real_url in video_url_list:
 624             # Extension
 625             video_extension = self._video_extensions.get(format_param, 'flv')
 626
 627             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 628                                               self._video_dimensions.get(format_param, '???'))
 629
 630             results.append({
 631                 'id':       video_id,
 632                 'url':      video_real_url,
 633                 'uploader': video_uploader,
 634                 'uploader_id': video_uploader_id,
 635                 'upload_date':  upload_date,
 636                 'title':    video_title,
 637                 'ext':      video_extension,
 638                 'format':   video_format,
 639                 'thumbnail':    video_thumbnail,
 640                 'description':  video_description,
 641                 'player_url':   player_url,
 642                 'subtitles':    video_subtitles,
 643                 'duration':     video_duration
 644             })
 645         return results
 646
 647
 648 class MetacafeIE(InfoExtractor):
 649     """Information Extractor for metacafe.com."""
 650
 651     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 652     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 653     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 654     IE_NAME = u'metacafe'
 655
 656     def __init__(self, downloader=None):
 657         InfoExtractor.__init__(self, downloader)
 658
 659     def report_disclaimer(self):
 660         """Report disclaimer retrieval."""
 661         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 662
 663     def report_age_confirmation(self):
 664         """Report attempt to confirm age."""
 665         self._downloader.to_screen(u'[metacafe] Confirming age')
 666
 667     def report_download_webpage(self, video_id):
 668         """Report webpage download."""
 669         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 670
 671     def report_extraction(self, video_id):
 672         """Report information extraction."""
 673         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 674
 675     def _real_initialize(self):
 676         # Retrieve disclaimer
 677         request = compat_urllib_request.Request(self._DISCLAIMER)
 678         try:
 679             self.report_disclaimer()
 680             disclaimer = compat_urllib_request.urlopen(request).read()
 681         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 682             self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
 683             return
 684
 685         # Confirm age
 686         disclaimer_form = {
 687             'filters': '0',
 688             'submit': "Continue - I'm over 18",
 689             }
 690         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 691         try:
 692             self.report_age_confirmation()
 693             disclaimer = compat_urllib_request.urlopen(request).read()
 694         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 695             self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
 696             return
 697
 698     def _real_extract(self, url):
 699         # Extract id and simplified title from URL
 700         mobj = re.match(self._VALID_URL, url)
 701         if mobj is None:
 702             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 703             return
 704
 705         video_id = mobj.group(1)
 706
 707         # Check if video comes from YouTube
 708         mobj2 = re.match(r'^yt-(.*)$', video_id)
 709         if mobj2 is not None:
 710             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 711             return
 712
 713         # Retrieve video webpage to extract further information
 714         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
 715         try:
 716             self.report_download_webpage(video_id)
 717             webpage = compat_urllib_request.urlopen(request).read()
 718         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 719             self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
 720             return
 721
 722         # Extract URL, uploader and title from webpage
 723         self.report_extraction(video_id)
 724         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 725         if mobj is not None:
 726             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 727             video_extension = mediaURL[-3:]
 728
 729             # Extract gdaKey if available
 730             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 731             if mobj is None:
 732                 video_url = mediaURL
 733             else:
 734                 gdaKey = mobj.group(1)
 735                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 736         else:
 737             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 738             if mobj is None:
 739                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 740                 return
 741             vardict = compat_parse_qs(mobj.group(1))
 742             if 'mediaData' not in vardict:
 743                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 744                 return
 745             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 746             if mobj is None:
 747                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 748                 return
 749             mediaURL = mobj.group(1).replace('\\/', '/')
 750             video_extension = mediaURL[-3:]
 751             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 752
 753         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 754         if mobj is None:
 755             self._downloader.trouble(u'ERROR: unable to extract title')
 756             return
 757         video_title = mobj.group(1).decode('utf-8')
 758
 759         mobj = re.search(r'submitter=(.*?);', webpage)
 760         if mobj is None:
 761             self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 762             return
 763         video_uploader = mobj.group(1)
 764
 765         return [{
 766             'id':       video_id.decode('utf-8'),
 767             'url':      video_url.decode('utf-8'),
 768             'uploader': video_uploader.decode('utf-8'),
 769             'upload_date':  None,
 770             'title':    video_title,
 771             'ext':      video_extension.decode('utf-8'),
 772         }]
 773
 774
 775 class DailymotionIE(InfoExtractor):
 776     """Information Extractor for Dailymotion"""
 777
 778     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 779     IE_NAME = u'dailymotion'
 780     _WORKING = False
 781
 782     def __init__(self, downloader=None):
 783         InfoExtractor.__init__(self, downloader)
 784
 785     def report_extraction(self, video_id):
 786         """Report information extraction."""
 787         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 788
 789     def _real_extract(self, url):
 790         # Extract id and simplified title from URL
 791         mobj = re.match(self._VALID_URL, url)
 792         if mobj is None:
 793             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 794             return
 795
 796         video_id = mobj.group(1).split('_')[0].split('?')[0]
 797
 798         video_extension = 'mp4'
 799
 800         # Retrieve video webpage to extract further information
 801         request = compat_urllib_request.Request(url)
 802         request.add_header('Cookie', 'family_filter=off')
 803         webpage = self._download_webpage(request, video_id)
 804
 805         # Extract URL, uploader and title from webpage
 806         self.report_extraction(video_id)
 807         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 808         if mobj is None:
 809             self._downloader.trouble(u'ERROR: unable to extract media URL')
 810             return
 811         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 812
 813         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 814             if key in flashvars:
 815                 max_quality = key
 816                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 817                 break
 818         else:
 819             self._downloader.trouble(u'ERROR: unable to extract video URL')
 820             return
 821
 822         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 823         if mobj is None:
 824             self._downloader.trouble(u'ERROR: unable to extract video URL')
 825             return
 826
 827         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 828
 829         # TODO: support choosing qualities
 830
 831         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 832         if mobj is None:
 833             self._downloader.trouble(u'ERROR: unable to extract title')
 834             return
 835         video_title = unescapeHTML(mobj.group('title'))
 836
 837         video_uploader = None
 838         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 839         if mobj is None:
 840             # lookin for official user
 841             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 842             if mobj_official is None:
 843                 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 844             else:
 845                 video_uploader = mobj_official.group(1)
 846         else:
 847             video_uploader = mobj.group(1)
 848
 849         video_upload_date = None
 850         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 851         if mobj is not None:
 852             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 853
 854         return [{
 855             'id':       video_id,
 856             'url':      video_url,
 857             'uploader': video_uploader,
 858             'upload_date':  video_upload_date,
 859             'title':    video_title,
 860             'ext':      video_extension,
 861         }]
 862
 863
 864 class PhotobucketIE(InfoExtractor):
 865     """Information extractor for photobucket.com."""
 866
 867     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 868     IE_NAME = u'photobucket'
 869
 870     def __init__(self, downloader=None):
 871         InfoExtractor.__init__(self, downloader)
 872
 873     def report_download_webpage(self, video_id):
 874         """Report webpage download."""
 875         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 876
 877     def report_extraction(self, video_id):
 878         """Report information extraction."""
 879         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 880
 881     def _real_extract(self, url):
 882         # Extract id from URL
 883         mobj = re.match(self._VALID_URL, url)
 884         if mobj is None:
 885             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 886             return
 887
 888         video_id = mobj.group(1)
 889
 890         video_extension = 'flv'
 891
 892         # Retrieve video webpage to extract further information
 893         request = compat_urllib_request.Request(url)
 894         try:
 895             self.report_download_webpage(video_id)
 896             webpage = compat_urllib_request.urlopen(request).read()
 897         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 898             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 899             return
 900
 901         # Extract URL, uploader, and title from webpage
 902         self.report_extraction(video_id)
 903         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 904         if mobj is None:
 905             self._downloader.trouble(u'ERROR: unable to extract media URL')
 906             return
 907         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 908
 909         video_url = mediaURL
 910
 911         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 912         if mobj is None:
 913             self._downloader.trouble(u'ERROR: unable to extract title')
 914             return
 915         video_title = mobj.group(1).decode('utf-8')
 916
 917         video_uploader = mobj.group(2).decode('utf-8')
 918
 919         return [{
 920             'id':       video_id.decode('utf-8'),
 921             'url':      video_url.decode('utf-8'),
 922             'uploader': video_uploader,
 923             'upload_date':  None,
 924             'title':    video_title,
 925             'ext':      video_extension.decode('utf-8'),
 926         }]
 927
 928
 929 class YahooIE(InfoExtractor):
 930     """Information extractor for video.yahoo.com."""
 931
 932     _WORKING = False
 933     # _VALID_URL matches all Yahoo! Video URLs
 934     # _VPAGE_URL matches only the extractable '/watch/' URLs
 935     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 936     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 937     IE_NAME = u'video.yahoo'
 938
 939     def __init__(self, downloader=None):
 940         InfoExtractor.__init__(self, downloader)
 941
 942     def report_download_webpage(self, video_id):
 943         """Report webpage download."""
 944         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 945
 946     def report_extraction(self, video_id):
 947         """Report information extraction."""
 948         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 949
 950     def _real_extract(self, url, new_video=True):
 951         # Extract ID from URL
 952         mobj = re.match(self._VALID_URL, url)
 953         if mobj is None:
 954             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 955             return
 956
 957         video_id = mobj.group(2)
 958         video_extension = 'flv'
 959
 960         # Rewrite valid but non-extractable URLs as
 961         # extractable English language /watch/ URLs
 962         if re.match(self._VPAGE_URL, url) is None:
 963             request = compat_urllib_request.Request(url)
 964             try:
 965                 webpage = compat_urllib_request.urlopen(request).read()
 966             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 967                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 968                 return
 969
 970             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 971             if mobj is None:
 972                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 973                 return
 974             yahoo_id = mobj.group(1)
 975
 976             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 977             if mobj is None:
 978                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 979                 return
 980             yahoo_vid = mobj.group(1)
 981
 982             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 983             return self._real_extract(url, new_video=False)
 984
 985         # Retrieve video webpage to extract further information
 986         request = compat_urllib_request.Request(url)
 987         try:
 988             self.report_download_webpage(video_id)
 989             webpage = compat_urllib_request.urlopen(request).read()
 990         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 991             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
 992             return
 993
 994         # Extract uploader and title from webpage
 995         self.report_extraction(video_id)
 996         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 997         if mobj is None:
 998             self._downloader.trouble(u'ERROR: unable to extract video title')
 999             return
1000         video_title = mobj.group(1).decode('utf-8')
1001
1002         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1003         if mobj is None:
1004             self._downloader.trouble(u'ERROR: unable to extract video uploader')
1005             return
1006         video_uploader = mobj.group(1).decode('utf-8')
1007
1008         # Extract video thumbnail
1009         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1010         if mobj is None:
1011             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1012             return
1013         video_thumbnail = mobj.group(1).decode('utf-8')
1014
1015         # Extract video description
1016         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1017         if mobj is None:
1018             self._downloader.trouble(u'ERROR: unable to extract video description')
1019             return
1020         video_description = mobj.group(1).decode('utf-8')
1021         if not video_description:
1022             video_description = 'No description available.'
1023
1024         # Extract video height and width
1025         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1026         if mobj is None:
1027             self._downloader.trouble(u'ERROR: unable to extract video height')
1028             return
1029         yv_video_height = mobj.group(1)
1030
1031         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1032         if mobj is None:
1033             self._downloader.trouble(u'ERROR: unable to extract video width')
1034             return
1035         yv_video_width = mobj.group(1)
1036
1037         # Retrieve video playlist to extract media URL
1038         # I'm not completely sure what all these options are, but we
1039         # seem to need most of them, otherwise the server sends a 401.
1040         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1041         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1042         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1043                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1044                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1045         try:
1046             self.report_download_webpage(video_id)
1047             webpage = compat_urllib_request.urlopen(request).read()
1048         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1049             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1050             return
1051
1052         # Extract media URL from playlist XML
1053         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1054         if mobj is None:
1055             self._downloader.trouble(u'ERROR: Unable to extract media URL')
1056             return
1057         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1058         video_url = unescapeHTML(video_url)
1059
1060         return [{
1061             'id':       video_id.decode('utf-8'),
1062             'url':      video_url,
1063             'uploader': video_uploader,
1064             'upload_date':  None,
1065             'title':    video_title,
1066             'ext':      video_extension.decode('utf-8'),
1067             'thumbnail':    video_thumbnail.decode('utf-8'),
1068             'description':  video_description,
1069         }]
1070
1071
1072 class VimeoIE(InfoExtractor):
1073     """Information extractor for vimeo.com."""
1074
1075     # _VALID_URL matches Vimeo URLs
1076     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1077     IE_NAME = u'vimeo'
1078
1079     def __init__(self, downloader=None):
1080         InfoExtractor.__init__(self, downloader)
1081
1082     def report_download_webpage(self, video_id):
1083         """Report webpage download."""
1084         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1085
1086     def report_extraction(self, video_id):
1087         """Report information extraction."""
1088         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1089
1090     def _real_extract(self, url, new_video=True):
1091         # Extract ID from URL
1092         mobj = re.match(self._VALID_URL, url)
1093         if mobj is None:
1094             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1095             return
1096
1097         video_id = mobj.group('id')
1098         if not mobj.group('proto'):
1099             url = 'https://' + url
1100         if mobj.group('direct_link'):
1101             url = 'https://vimeo.com/' + video_id
1102
1103         # Retrieve video webpage to extract further information
1104         request = compat_urllib_request.Request(url, None, std_headers)
1105         try:
1106             self.report_download_webpage(video_id)
1107             webpage_bytes = compat_urllib_request.urlopen(request).read()
1108             webpage = webpage_bytes.decode('utf-8')
1109         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1110             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1111             return
1112
1113         # Now we begin extracting as much information as we can from what we
1114         # retrieved. First we extract the information common to all extractors,
1115         # and latter we extract those that are Vimeo specific.
1116         self.report_extraction(video_id)
1117
1118         # Extract the config JSON
1119         try:
1120             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1121             config = json.loads(config)
1122         except:
1123             self._downloader.trouble(u'ERROR: unable to extract info section')
1124             return
1125
1126         # Extract title
1127         video_title = config["video"]["title"]
1128
1129         # Extract uploader and uploader_id
1130         video_uploader = config["video"]["owner"]["name"]
1131         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1132
1133         # Extract video thumbnail
1134         video_thumbnail = config["video"]["thumbnail"]
1135
1136         # Extract video description
1137         video_description = get_element_by_attribute("itemprop", "description", webpage)
1138         if video_description: video_description = clean_html(video_description)
1139         else: video_description = ''
1140
1141         # Extract upload date
1142         video_upload_date = None
1143         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1144         if mobj is not None:
1145             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1146
1147         # Vimeo specific: extract request signature and timestamp
1148         sig = config['request']['signature']
1149         timestamp = config['request']['timestamp']
1150
1151         # Vimeo specific: extract video codec and quality information
1152         # First consider quality, then codecs, then take everything
1153         # TODO bind to format param
1154         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1155         files = { 'hd': [], 'sd': [], 'other': []}
1156         for codec_name, codec_extension in codecs:
1157             if codec_name in config["video"]["files"]:
1158                 if 'hd' in config["video"]["files"][codec_name]:
1159                     files['hd'].append((codec_name, codec_extension, 'hd'))
1160                 elif 'sd' in config["video"]["files"][codec_name]:
1161                     files['sd'].append((codec_name, codec_extension, 'sd'))
1162                 else:
1163                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1164
1165         for quality in ('hd', 'sd', 'other'):
1166             if len(files[quality]) > 0:
1167                 video_quality = files[quality][0][2]
1168                 video_codec = files[quality][0][0]
1169                 video_extension = files[quality][0][1]
1170                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1171                 break
1172         else:
1173             self._downloader.trouble(u'ERROR: no known codec found')
1174             return
1175
1176         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1177                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1178
1179         return [{
1180             'id':       video_id,
1181             'url':      video_url,
1182             'uploader': video_uploader,
1183             'uploader_id': video_uploader_id,
1184             'upload_date':  video_upload_date,
1185             'title':    video_title,
1186             'ext':      video_extension,
1187             'thumbnail':    video_thumbnail,
1188             'description':  video_description,
1189         }]
1190
1191
1192 class ArteTvIE(InfoExtractor):
1193     """arte.tv information extractor."""
1194
1195     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1196     _LIVE_URL = r'index-[0-9]+\.html$'
1197
1198     IE_NAME = u'arte.tv'
1199
1200     def __init__(self, downloader=None):
1201         InfoExtractor.__init__(self, downloader)
1202
1203     def report_download_webpage(self, video_id):
1204         """Report webpage download."""
1205         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1206
1207     def report_extraction(self, video_id):
1208         """Report information extraction."""
1209         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1210
1211     def fetch_webpage(self, url):
1212         request = compat_urllib_request.Request(url)
1213         try:
1214             self.report_download_webpage(url)
1215             webpage = compat_urllib_request.urlopen(request).read()
1216         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1217             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1218             return
1219         except ValueError as err:
1220             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1221             return
1222         return webpage
1223
1224     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1225         page = self.fetch_webpage(url)
1226         mobj = re.search(regex, page, regexFlags)
1227         info = {}
1228
1229         if mobj is None:
1230             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1231             return
1232
1233         for (i, key, err) in matchTuples:
1234             if mobj.group(i) is None:
1235                 self._downloader.trouble(err)
1236                 return
1237             else:
1238                 info[key] = mobj.group(i)
1239
1240         return info
1241
1242     def extractLiveStream(self, url):
1243         video_lang = url.split('/')[-4]
1244         info = self.grep_webpage(
1245             url,
1246             r'src="(.*?/videothek_js.*?\.js)',
1247             0,
1248             [
1249                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1250             ]
1251         )
1252         http_host = url.split('/')[2]
1253         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1254         info = self.grep_webpage(
1255             next_url,
1256             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1257                 '(http://.*?\.swf).*?' +
1258                 '(rtmp://.*?)\'',
1259             re.DOTALL,
1260             [
1261                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1262                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1263                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1264             ]
1265         )
1266         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1267
1268     def extractPlus7Stream(self, url):
1269         video_lang = url.split('/')[-3]
1270         info = self.grep_webpage(
1271             url,
1272             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1273             0,
1274             [
1275                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1276             ]
1277         )
1278         next_url = compat_urllib_parse.unquote(info.get('url'))
1279         info = self.grep_webpage(
1280             next_url,
1281             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1282             0,
1283             [
1284                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1285             ]
1286         )
1287         next_url = compat_urllib_parse.unquote(info.get('url'))
1288
1289         info = self.grep_webpage(
1290             next_url,
1291             r'<video id="(.*?)".*?>.*?' +
1292                 '<name>(.*?)</name>.*?' +
1293                 '<dateVideo>(.*?)</dateVideo>.*?' +
1294                 '<url quality="hd">(.*?)</url>',
1295             re.DOTALL,
1296             [
1297                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1298                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1299                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1300                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1301             ]
1302         )
1303
1304         return {
1305             'id':           info.get('id'),
1306             'url':          compat_urllib_parse.unquote(info.get('url')),
1307             'uploader':     u'arte.tv',
1308             'upload_date':  info.get('date'),
1309             'title':        info.get('title').decode('utf-8'),
1310             'ext':          u'mp4',
1311             'format':       u'NA',
1312             'player_url':   None,
1313         }
1314
1315     def _real_extract(self, url):
1316         video_id = url.split('/')[-1]
1317         self.report_extraction(video_id)
1318
1319         if re.search(self._LIVE_URL, video_id) is not None:
1320             self.extractLiveStream(url)
1321             return
1322         else:
1323             info = self.extractPlus7Stream(url)
1324
1325         return [info]
1326
1327
1328 class GenericIE(InfoExtractor):
1329     """Generic last-resort information extractor."""
1330
1331     _VALID_URL = r'.*'
1332     IE_NAME = u'generic'
1333
1334     def __init__(self, downloader=None):
1335         InfoExtractor.__init__(self, downloader)
1336
1337     def report_download_webpage(self, video_id):
1338         """Report webpage download."""
1339         if not self._downloader.params.get('test', False):
1340             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1341         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1342
1343     def report_extraction(self, video_id):
1344         """Report information extraction."""
1345         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1346
1347     def report_following_redirect(self, new_url):
1348         """Report information extraction."""
1349         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1350
1351     def _test_redirect(self, url):
1352         """Check if it is a redirect, like url shorteners, in case return the new url."""
1353         class HeadRequest(compat_urllib_request.Request):
1354             def get_method(self):
1355                 return "HEAD"
1356
1357         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1358             """
1359             Subclass the HTTPRedirectHandler to make it use our
1360             HeadRequest also on the redirected URL
1361             """
1362             def redirect_request(self, req, fp, code, msg, headers, newurl):
1363                 if code in (301, 302, 303, 307):
1364                     newurl = newurl.replace(' ', '%20')
1365                     newheaders = dict((k,v) for k,v in req.headers.items()
1366                                       if k.lower() not in ("content-length", "content-type"))
1367                     return HeadRequest(newurl,
1368                                        headers=newheaders,
1369                                        origin_req_host=req.get_origin_req_host(),
1370                                        unverifiable=True)
1371                 else:
1372                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1373
1374         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1375             """
1376             Fallback to GET if HEAD is not allowed (405 HTTP error)
1377             """
1378             def http_error_405(self, req, fp, code, msg, headers):
1379                 fp.read()
1380                 fp.close()
1381
1382                 newheaders = dict((k,v) for k,v in req.headers.items()
1383                                   if k.lower() not in ("content-length", "content-type"))
1384                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1385                                                  headers=newheaders,
1386                                                  origin_req_host=req.get_origin_req_host(),
1387                                                  unverifiable=True))
1388
1389         # Build our opener
1390         opener = compat_urllib_request.OpenerDirector()
1391         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1392                         HTTPMethodFallback, HEADRedirectHandler,
1393                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1394             opener.add_handler(handler())
1395
1396         response = opener.open(HeadRequest(url))
1397         new_url = response.geturl()
1398
1399         if url == new_url:
1400             return False
1401
1402         self.report_following_redirect(new_url)
1403         return new_url
1404
1405     def _real_extract(self, url):
1406         new_url = self._test_redirect(url)
1407         if new_url: return [self.url_result(new_url)]
1408
1409         video_id = url.split('/')[-1]
1410         try:
1411             webpage = self._download_webpage(url, video_id)
1412         except ValueError as err:
1413             # since this is the last-resort InfoExtractor, if
1414             # this error is thrown, it'll be thrown here
1415             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1416             return
1417
1418         self.report_extraction(video_id)
1419         # Start with something easy: JW Player in SWFObject
1420         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1421         if mobj is None:
1422             # Broaden the search a little bit
1423             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1424         if mobj is None:
1425             # Broaden the search a little bit: JWPlayer JS loader
1426             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1427         if mobj is None:
1428             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1429             return
1430
1431         # It's possible that one of the regexes
1432         # matched, but returned an empty group:
1433         if mobj.group(1) is None:
1434             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1435             return
1436
1437         video_url = compat_urllib_parse.unquote(mobj.group(1))
1438         video_id = os.path.basename(video_url)
1439
1440         # here's a fun little line of code for you:
1441         video_extension = os.path.splitext(video_id)[1][1:]
1442         video_id = os.path.splitext(video_id)[0]
1443
1444         # it's tempting to parse this further, but you would
1445         # have to take into account all the variations like
1446         #   Video Title - Site Name
1447         #   Site Name | Video Title
1448         #   Video Title - Tagline | Site Name
1449         # and so on and so forth; it's just not practical
1450         mobj = re.search(r'<title>(.*)</title>', webpage)
1451         if mobj is None:
1452             self._downloader.trouble(u'ERROR: unable to extract title')
1453             return
1454         video_title = mobj.group(1)
1455
1456         # video uploader is domain name
1457         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1458         if mobj is None:
1459             self._downloader.trouble(u'ERROR: unable to extract title')
1460             return
1461         video_uploader = mobj.group(1)
1462
1463         return [{
1464             'id':       video_id,
1465             'url':      video_url,
1466             'uploader': video_uploader,
1467             'upload_date':  None,
1468             'title':    video_title,
1469             'ext':      video_extension,
1470         }]
1471
1472
1473 class YoutubeSearchIE(InfoExtractor):
1474     """Information Extractor for YouTube search queries."""
1475     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1476     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1477     _max_youtube_results = 1000
1478     IE_NAME = u'youtube:search'
1479
1480     def __init__(self, downloader=None):
1481         InfoExtractor.__init__(self, downloader)
1482
1483     def report_download_page(self, query, pagenum):
1484         """Report attempt to download search page with given number."""
1485         query = query.decode(preferredencoding())
1486         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1487
1488     def _real_extract(self, query):
1489         mobj = re.match(self._VALID_URL, query)
1490         if mobj is None:
1491             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1492             return
1493
1494         prefix, query = query.split(':')
1495         prefix = prefix[8:]
1496         query = query.encode('utf-8')
1497         if prefix == '':
1498             self._download_n_results(query, 1)
1499             return
1500         elif prefix == 'all':
1501             self._download_n_results(query, self._max_youtube_results)
1502             return
1503         else:
1504             try:
1505                 n = int(prefix)
1506                 if n <= 0:
1507                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1508                     return
1509                 elif n > self._max_youtube_results:
1510                     self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1511                     n = self._max_youtube_results
1512                 self._download_n_results(query, n)
1513                 return
1514             except ValueError: # parsing prefix as integer fails
1515                 self._download_n_results(query, 1)
1516                 return
1517
1518     def _download_n_results(self, query, n):
1519         """Downloads a specified number of results for a query"""
1520
1521         video_ids = []
1522         pagenum = 0
1523         limit = n
1524
1525         while (50 * pagenum) < limit:
1526             self.report_download_page(query, pagenum+1)
1527             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1528             request = compat_urllib_request.Request(result_url)
1529             try:
1530                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1531             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1532                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1533                 return
1534             api_response = json.loads(data)['data']
1535
1536             if not 'items' in api_response:
1537                 self._downloader.trouble(u'[youtube] No video results')
1538                 return
1539
1540             new_ids = list(video['id'] for video in api_response['items'])
1541             video_ids += new_ids
1542
1543             limit = min(n, api_response['totalItems'])
1544             pagenum += 1
1545
1546         if len(video_ids) > n:
1547             video_ids = video_ids[:n]
1548         for id in video_ids:
1549             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1550         return
1551
1552
1553 class GoogleSearchIE(InfoExtractor):
1554     """Information Extractor for Google Video search queries."""
1555     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1556     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1557     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1558     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1559     _max_google_results = 1000
1560     IE_NAME = u'video.google:search'
1561
1562     def __init__(self, downloader=None):
1563         InfoExtractor.__init__(self, downloader)
1564
1565     def report_download_page(self, query, pagenum):
1566         """Report attempt to download playlist page with given number."""
1567         query = query.decode(preferredencoding())
1568         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1569
1570     def _real_extract(self, query):
1571         mobj = re.match(self._VALID_URL, query)
1572         if mobj is None:
1573             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1574             return
1575
1576         prefix, query = query.split(':')
1577         prefix = prefix[8:]
1578         query = query.encode('utf-8')
1579         if prefix == '':
1580             self._download_n_results(query, 1)
1581             return
1582         elif prefix == 'all':
1583             self._download_n_results(query, self._max_google_results)
1584             return
1585         else:
1586             try:
1587                 n = int(prefix)
1588                 if n <= 0:
1589                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1590                     return
1591                 elif n > self._max_google_results:
1592                     self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1593                     n = self._max_google_results
1594                 self._download_n_results(query, n)
1595                 return
1596             except ValueError: # parsing prefix as integer fails
1597                 self._download_n_results(query, 1)
1598                 return
1599
1600     def _download_n_results(self, query, n):
1601         """Downloads a specified number of results for a query"""
1602
1603         video_ids = []
1604         pagenum = 0
1605
1606         while True:
1607             self.report_download_page(query, pagenum)
1608             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1609             request = compat_urllib_request.Request(result_url)
1610             try:
1611                 page = compat_urllib_request.urlopen(request).read()
1612             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1613                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1614                 return
1615
1616             # Extract video identifiers
1617             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1618                 video_id = mobj.group(1)
1619                 if video_id not in video_ids:
1620                     video_ids.append(video_id)
1621                     if len(video_ids) == n:
1622                         # Specified n videos reached
1623                         for id in video_ids:
1624                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1625                         return
1626
1627             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1628                 for id in video_ids:
1629                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1630                 return
1631
1632             pagenum = pagenum + 1
1633
1634
1635 class YahooSearchIE(InfoExtractor):
1636     """Information Extractor for Yahoo! Video search queries."""
1637
1638     _WORKING = False
1639     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1640     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1641     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1642     _MORE_PAGES_INDICATOR = r'\s*Next'
1643     _max_yahoo_results = 1000
1644     IE_NAME = u'video.yahoo:search'
1645
1646     def __init__(self, downloader=None):
1647         InfoExtractor.__init__(self, downloader)
1648
1649     def report_download_page(self, query, pagenum):
1650         """Report attempt to download playlist page with given number."""
1651         query = query.decode(preferredencoding())
1652         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1653
1654     def _real_extract(self, query):
1655         mobj = re.match(self._VALID_URL, query)
1656         if mobj is None:
1657             self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1658             return
1659
1660         prefix, query = query.split(':')
1661         prefix = prefix[8:]
1662         query = query.encode('utf-8')
1663         if prefix == '':
1664             self._download_n_results(query, 1)
1665             return
1666         elif prefix == 'all':
1667             self._download_n_results(query, self._max_yahoo_results)
1668             return
1669         else:
1670             try:
1671                 n = int(prefix)
1672                 if n <= 0:
1673                     self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1674                     return
1675                 elif n > self._max_yahoo_results:
1676                     self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1677                     n = self._max_yahoo_results
1678                 self._download_n_results(query, n)
1679                 return
1680             except ValueError: # parsing prefix as integer fails
1681                 self._download_n_results(query, 1)
1682                 return
1683
1684     def _download_n_results(self, query, n):
1685         """Downloads a specified number of results for a query"""
1686
1687         video_ids = []
1688         already_seen = set()
1689         pagenum = 1
1690
1691         while True:
1692             self.report_download_page(query, pagenum)
1693             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1694             request = compat_urllib_request.Request(result_url)
1695             try:
1696                 page = compat_urllib_request.urlopen(request).read()
1697             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1698                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1699                 return
1700
1701             # Extract video identifiers
1702             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1703                 video_id = mobj.group(1)
1704                 if video_id not in already_seen:
1705                     video_ids.append(video_id)
1706                     already_seen.add(video_id)
1707                     if len(video_ids) == n:
1708                         # Specified n videos reached
1709                         for id in video_ids:
1710                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1711                         return
1712
1713             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1714                 for id in video_ids:
1715                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1716                 return
1717
1718             pagenum = pagenum + 1
1719
1720
1721 class YoutubePlaylistIE(InfoExtractor):
1722     """Information Extractor for YouTube playlists."""
1723
1724     _VALID_URL = r"""(?:
1725                         (?:https?://)?
1726                         (?:\w+\.)?
1727                         youtube\.com/
1728                         (?:
1729                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1730                            \? (?:.*?&)*? (?:p|a|list)=
1731                         |  user/.*?/user/
1732                         |  p/
1733                         |  user/.*?#[pg]/c/
1734                         )
1735                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1736                         .*
1737                      |
1738                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1739                      )"""
1740     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1741     _MAX_RESULTS = 50
1742     IE_NAME = u'youtube:playlist'
1743
1744     def __init__(self, downloader=None):
1745         InfoExtractor.__init__(self, downloader)
1746
1747     @classmethod
1748     def suitable(cls, url):
1749         """Receives a URL and returns True if suitable for this IE."""
1750         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1751
1752     def report_download_page(self, playlist_id, pagenum):
1753         """Report attempt to download playlist page with given number."""
1754         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1755
1756     def _real_extract(self, url):
1757         # Extract playlist id
1758         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1759         if mobj is None:
1760             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1761             return
1762
1763         # Download playlist videos from API
1764         playlist_id = mobj.group(1) or mobj.group(2)
1765         page_num = 1
1766         videos = []
1767
1768         while True:
1769             self.report_download_page(playlist_id, page_num)
1770
1771             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1772             try:
1773                 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1774             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1775                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1776                 return
1777
1778             try:
1779                 response = json.loads(page)
1780             except ValueError as err:
1781                 self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
1782                 return
1783
1784             if not 'feed' in response or not 'entry' in response['feed']:
1785                 self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
1786                 return
1787             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1788                         for entry in response['feed']['entry']
1789                         if 'content' in entry ]
1790
1791             if len(response['feed']['entry']) < self._MAX_RESULTS:
1792                 break
1793             page_num += 1
1794
1795         videos = [v[1] for v in sorted(videos)]
1796         total = len(videos)
1797
1798         playliststart = self._downloader.params.get('playliststart', 1) - 1
1799         playlistend = self._downloader.params.get('playlistend', -1)
1800         if playlistend == -1:
1801             videos = videos[playliststart:]
1802         else:
1803             videos = videos[playliststart:playlistend]
1804
1805         if len(videos) == total:
1806             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1807         else:
1808             self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1809
1810         url_results = [self.url_result(url) for url in videos]
1811         return [self.playlist_result(url_results)]
1812
1813
1814 class YoutubeChannelIE(InfoExtractor):
1815     """Information Extractor for YouTube channels."""
1816
1817     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1818     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1819     _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1820     IE_NAME = u'youtube:channel'
1821
1822     def report_download_page(self, channel_id, pagenum):
1823         """Report attempt to download channel page with given number."""
1824         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1825
1826     def _real_extract(self, url):
1827         # Extract channel id
1828         mobj = re.match(self._VALID_URL, url)
1829         if mobj is None:
1830             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1831             return
1832
1833         # Download channel pages
1834         channel_id = mobj.group(1)
1835         video_ids = []
1836         pagenum = 1
1837
1838         while True:
1839             self.report_download_page(channel_id, pagenum)
1840             url = self._TEMPLATE_URL % (channel_id, pagenum)
1841             request = compat_urllib_request.Request(url)
1842             try:
1843                 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1844             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1845                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1846                 return
1847
1848             # Extract video identifiers
1849             ids_in_page = []
1850             for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1851                 if mobj.group(1) not in ids_in_page:
1852                     ids_in_page.append(mobj.group(1))
1853             video_ids.extend(ids_in_page)
1854
1855             if self._MORE_PAGES_INDICATOR not in page:
1856                 break
1857             pagenum = pagenum + 1
1858
1859         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1860
1861         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1862         url_entries = [self.url_result(url) for url in urls]
1863         return [self.playlist_result(url_entries)]
1864
1865
1866 class YoutubeUserIE(InfoExtractor):
1867     """Information Extractor for YouTube users."""
1868
1869     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1870     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1871     _GDATA_PAGE_SIZE = 50
1872     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1873     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1874     IE_NAME = u'youtube:user'
1875
1876     def __init__(self, downloader=None):
1877         InfoExtractor.__init__(self, downloader)
1878
1879     def report_download_page(self, username, start_index):
1880         """Report attempt to download user page."""
1881         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1882                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1883
1884     def _real_extract(self, url):
1885         # Extract username
1886         mobj = re.match(self._VALID_URL, url)
1887         if mobj is None:
1888             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1889             return
1890
1891         username = mobj.group(1)
1892
1893         # Download video ids using YouTube Data API. Result size per
1894         # query is limited (currently to 50 videos) so we need to query
1895         # page by page until there are no video ids - it means we got
1896         # all of them.
1897
1898         video_ids = []
1899         pagenum = 0
1900
1901         while True:
1902             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1903             self.report_download_page(username, start_index)
1904
1905             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1906
1907             try:
1908                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1909             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1910                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1911                 return
1912
1913             # Extract video identifiers
1914             ids_in_page = []
1915
1916             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1917                 if mobj.group(1) not in ids_in_page:
1918                     ids_in_page.append(mobj.group(1))
1919
1920             video_ids.extend(ids_in_page)
1921
1922             # A little optimization - if current page is not
1923             # "full", ie. does not contain PAGE_SIZE video ids then
1924             # we can assume that this page is the last one - there
1925             # are no more ids on further pages - no need to query
1926             # again.
1927
1928             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1929                 break
1930
1931             pagenum += 1
1932
1933         all_ids_count = len(video_ids)
1934         playliststart = self._downloader.params.get('playliststart', 1) - 1
1935         playlistend = self._downloader.params.get('playlistend', -1)
1936
1937         if playlistend == -1:
1938             video_ids = video_ids[playliststart:]
1939         else:
1940             video_ids = video_ids[playliststart:playlistend]
1941
1942         self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1943                 (username, all_ids_count, len(video_ids)))
1944
1945         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1946         url_results = [self.url_result(url) for url in urls]
1947         return [self.playlist_result(url_results)]
1948
1949
1950 class BlipTVUserIE(InfoExtractor):
1951     """Information Extractor for blip.tv users."""
1952
1953     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1954     _PAGE_SIZE = 12
1955     IE_NAME = u'blip.tv:user'
1956
1957     def __init__(self, downloader=None):
1958         InfoExtractor.__init__(self, downloader)
1959
1960     def report_download_page(self, username, pagenum):
1961         """Report attempt to download user page."""
1962         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1963                 (self.IE_NAME, username, pagenum))
1964
1965     def _real_extract(self, url):
1966         # Extract username
1967         mobj = re.match(self._VALID_URL, url)
1968         if mobj is None:
1969             self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1970             return
1971
1972         username = mobj.group(1)
1973
1974         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1975
1976         request = compat_urllib_request.Request(url)
1977
1978         try:
1979             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1980             mobj = re.search(r'data-users-id="([^"]+)"', page)
1981             page_base = page_base % mobj.group(1)
1982         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1983             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1984             return
1985
1986
1987         # Download video ids using BlipTV Ajax calls. Result size per
1988         # query is limited (currently to 12 videos) so we need to query
1989         # page by page until there are no video ids - it means we got
1990         # all of them.
1991
1992         video_ids = []
1993         pagenum = 1
1994
1995         while True:
1996             self.report_download_page(username, pagenum)
1997             url = page_base + "&page=" + str(pagenum)
1998             request = compat_urllib_request.Request( url )
1999             try:
2000                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2001             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2002                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2003                 return
2004
2005             # Extract video identifiers
2006             ids_in_page = []
2007
2008             for mobj in re.finditer(r'href="/([^"]+)"', page):
2009                 if mobj.group(1) not in ids_in_page:
2010                     ids_in_page.append(unescapeHTML(mobj.group(1)))
2011
2012             video_ids.extend(ids_in_page)
2013
2014             # A little optimization - if current page is not
2015             # "full", ie. does not contain PAGE_SIZE video ids then
2016             # we can assume that this page is the last one - there
2017             # are no more ids on further pages - no need to query
2018             # again.
2019
2020             if len(ids_in_page) < self._PAGE_SIZE:
2021                 break
2022
2023             pagenum += 1
2024
2025         all_ids_count = len(video_ids)
2026         playliststart = self._downloader.params.get('playliststart', 1) - 1
2027         playlistend = self._downloader.params.get('playlistend', -1)
2028
2029         if playlistend == -1:
2030             video_ids = video_ids[playliststart:]
2031         else:
2032             video_ids = video_ids[playliststart:playlistend]
2033
2034         self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2035                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
2036
2037         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2038         url_entries = [self.url_result(url) for url in urls]
2039         return [self.playlist_result(url_entries)]
2040
2041
2042 class DepositFilesIE(InfoExtractor):
2043     """Information extractor for depositfiles.com"""
2044
2045     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2046
2047     def report_download_webpage(self, file_id):
2048         """Report webpage download."""
2049         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2050
2051     def report_extraction(self, file_id):
2052         """Report information extraction."""
2053         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2054
2055     def _real_extract(self, url):
2056         file_id = url.split('/')[-1]
2057         # Rebuild url in english locale
2058         url = 'http://depositfiles.com/en/files/' + file_id
2059
2060         # Retrieve file webpage with 'Free download' button pressed
2061         free_download_indication = { 'gateway_result' : '1' }
2062         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2063         try:
2064             self.report_download_webpage(file_id)
2065             webpage = compat_urllib_request.urlopen(request).read()
2066         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2067             self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2068             return
2069
2070         # Search for the real file URL
2071         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2072         if (mobj is None) or (mobj.group(1) is None):
2073             # Try to figure out reason of the error.
2074             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2075             if (mobj is not None) and (mobj.group(1) is not None):
2076                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2077                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2078             else:
2079                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2080             return
2081
2082         file_url = mobj.group(1)
2083         file_extension = os.path.splitext(file_url)[1][1:]
2084
2085         # Search for file title
2086         mobj = re.search(r'<b title="(.*?)">', webpage)
2087         if mobj is None:
2088             self._downloader.trouble(u'ERROR: unable to extract title')
2089             return
2090         file_title = mobj.group(1).decode('utf-8')
2091
2092         return [{
2093             'id':       file_id.decode('utf-8'),
2094             'url':      file_url.decode('utf-8'),
2095             'uploader': None,
2096             'upload_date':  None,
2097             'title':    file_title,
2098             'ext':      file_extension.decode('utf-8'),
2099         }]
2100
2101
2102 class FacebookIE(InfoExtractor):
2103     """Information Extractor for Facebook"""
2104
2105     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2106     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2107     _NETRC_MACHINE = 'facebook'
2108     IE_NAME = u'facebook'
2109
2110     def report_login(self):
2111         """Report attempt to log in."""
2112         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2113
2114     def _real_initialize(self):
2115         if self._downloader is None:
2116             return
2117
2118         useremail = None
2119         password = None
2120         downloader_params = self._downloader.params
2121
2122         # Attempt to use provided username and password or .netrc data
2123         if downloader_params.get('username', None) is not None:
2124             useremail = downloader_params['username']
2125             password = downloader_params['password']
2126         elif downloader_params.get('usenetrc', False):
2127             try:
2128                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2129                 if info is not None:
2130                     useremail = info[0]
2131                     password = info[2]
2132                 else:
2133                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2134             except (IOError, netrc.NetrcParseError) as err:
2135                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2136                 return
2137
2138         if useremail is None:
2139             return
2140
2141         # Log in
2142         login_form = {
2143             'email': useremail,
2144             'pass': password,
2145             'login': 'Log+In'
2146             }
2147         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2148         try:
2149             self.report_login()
2150             login_results = compat_urllib_request.urlopen(request).read()
2151             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2152                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2153                 return
2154         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2155             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2156             return
2157
2158     def _real_extract(self, url):
2159         mobj = re.match(self._VALID_URL, url)
2160         if mobj is None:
2161             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2162             return
2163         video_id = mobj.group('ID')
2164
2165         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2166         webpage = self._download_webpage(url, video_id)
2167
2168         BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2169         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2170         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2171         if not m:
2172             raise ExtractorError(u'Cannot parse data')
2173         data = dict(json.loads(m.group(1)))
2174         params_raw = compat_urllib_parse.unquote(data['params'])
2175         params = json.loads(params_raw)
2176         video_url = params['hd_src']
2177         if not video_url:
2178             video_url = params['sd_src']
2179         if not video_url:
2180             raise ExtractorError(u'Cannot find video URL')
2181         video_duration = int(params['video_duration'])
2182
2183         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2184         if not m:
2185             raise ExtractorError(u'Cannot find title in webpage')
2186         video_title = unescapeHTML(m.group(1))
2187
2188         info = {
2189             'id': video_id,
2190             'title': video_title,
2191             'url': video_url,
2192             'ext': 'mp4',
2193             'duration': video_duration,
2194             'thumbnail': params['thumbnail_src'],
2195         }
2196         return [info]
2197
2198
2199 class BlipTVIE(InfoExtractor):
2200     """Information extractor for blip.tv"""
2201
2202     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2203     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2204     IE_NAME = u'blip.tv'
2205
2206     def report_extraction(self, file_id):
2207         """Report information extraction."""
2208         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2209
2210     def report_direct_download(self, title):
2211         """Report information extraction."""
2212         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2213
2214     def _real_extract(self, url):
2215         mobj = re.match(self._VALID_URL, url)
2216         if mobj is None:
2217             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2218             return
2219
2220         urlp = compat_urllib_parse_urlparse(url)
2221         if urlp.path.startswith('/play/'):
2222             request = compat_urllib_request.Request(url)
2223             response = compat_urllib_request.urlopen(request)
2224             redirecturl = response.geturl()
2225             rurlp = compat_urllib_parse_urlparse(redirecturl)
2226             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2227             url = 'http://blip.tv/a/a-' + file_id
2228             return self._real_extract(url)
2229
2230
2231         if '?' in url:
2232             cchar = '&'
2233         else:
2234             cchar = '?'
2235         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2236         request = compat_urllib_request.Request(json_url)
2237         request.add_header('User-Agent', 'iTunes/10.6.1')
2238         self.report_extraction(mobj.group(1))
2239         info = None
2240         try:
2241             urlh = compat_urllib_request.urlopen(request)
2242             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2243                 basename = url.split('/')[-1]
2244                 title,ext = os.path.splitext(basename)
2245                 title = title.decode('UTF-8')
2246                 ext = ext.replace('.', '')
2247                 self.report_direct_download(title)
2248                 info = {
2249                     'id': title,
2250                     'url': url,
2251                     'uploader': None,
2252                     'upload_date': None,
2253                     'title': title,
2254                     'ext': ext,
2255                     'urlhandle': urlh
2256                 }
2257         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2258             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2259         if info is None: # Regular URL
2260             try:
2261                 json_code_bytes = urlh.read()
2262                 json_code = json_code_bytes.decode('utf-8')
2263             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2264                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2265                 return
2266
2267             try:
2268                 json_data = json.loads(json_code)
2269                 if 'Post' in json_data:
2270                     data = json_data['Post']
2271                 else:
2272                     data = json_data
2273
2274                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2275                 video_url = data['media']['url']
2276                 umobj = re.match(self._URL_EXT, video_url)
2277                 if umobj is None:
2278                     raise ValueError('Can not determine filename extension')
2279                 ext = umobj.group(1)
2280
2281                 info = {
2282                     'id': data['item_id'],
2283                     'url': video_url,
2284                     'uploader': data['display_name'],
2285                     'upload_date': upload_date,
2286                     'title': data['title'],
2287                     'ext': ext,
2288                     'format': data['media']['mimeType'],
2289                     'thumbnail': data['thumbnailUrl'],
2290                     'description': data['description'],
2291                     'player_url': data['embedUrl'],
2292                     'user_agent': 'iTunes/10.6.1',
2293                 }
2294             except (ValueError,KeyError) as err:
2295                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2296                 return
2297
2298         return [info]
2299
2300
2301 class MyVideoIE(InfoExtractor):
2302     """Information Extractor for myvideo.de."""
2303
2304     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2305     IE_NAME = u'myvideo'
2306
2307     def __init__(self, downloader=None):
2308         InfoExtractor.__init__(self, downloader)
2309
2310     def report_extraction(self, video_id):
2311         """Report information extraction."""
2312         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2313
2314     def _real_extract(self,url):
2315         mobj = re.match(self._VALID_URL, url)
2316         if mobj is None:
2317             self._download.trouble(u'ERROR: invalid URL: %s' % url)
2318             return
2319
2320         video_id = mobj.group(1)
2321
2322         # Get video webpage
2323         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2324         webpage = self._download_webpage(webpage_url, video_id)
2325
2326         self.report_extraction(video_id)
2327         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2328                  webpage)
2329         if mobj is None:
2330             self._downloader.trouble(u'ERROR: unable to extract media URL')
2331             return
2332         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2333
2334         mobj = re.search('<title>([^<]+)</title>', webpage)
2335         if mobj is None:
2336             self._downloader.trouble(u'ERROR: unable to extract title')
2337             return
2338
2339         video_title = mobj.group(1)
2340
2341         return [{
2342             'id':       video_id,
2343             'url':      video_url,
2344             'uploader': None,
2345             'upload_date':  None,
2346             'title':    video_title,
2347             'ext':      u'flv',
2348         }]
2349
2350 class ComedyCentralIE(InfoExtractor):
2351     """Information extractor for The Daily Show and Colbert Report """
2352
2353     # urls can be abbreviations like :thedailyshow or :colbert
2354     # urls for episodes like:
2355     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2356     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2357     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2358     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2359                       |(https?://)?(www\.)?
2360                           (?P<showname>thedailyshow|colbertnation)\.com/
2361                          (full-episodes/(?P<episode>.*)|
2362                           (?P<clip>
2363                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2364                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2365                      $"""
2366
2367     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2368
2369     _video_extensions = {
2370         '3500': 'mp4',
2371         '2200': 'mp4',
2372         '1700': 'mp4',
2373         '1200': 'mp4',
2374         '750': 'mp4',
2375         '400': 'mp4',
2376     }
2377     _video_dimensions = {
2378         '3500': '1280x720',
2379         '2200': '960x540',
2380         '1700': '768x432',
2381         '1200': '640x360',
2382         '750': '512x288',
2383         '400': '384x216',
2384     }
2385
2386     @classmethod
2387     def suitable(cls, url):
2388         """Receives a URL and returns True if suitable for this IE."""
2389         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2390
2391     def report_extraction(self, episode_id):
2392         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2393
2394     def report_config_download(self, episode_id, media_id):
2395         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2396
2397     def report_index_download(self, episode_id):
2398         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2399
2400     def _print_formats(self, formats):
2401         print('Available formats:')
2402         for x in formats:
2403             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2404
2405
2406     def _real_extract(self, url):
2407         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2408         if mobj is None:
2409             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2410             return
2411
2412         if mobj.group('shortname'):
2413             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2414                 url = u'http://www.thedailyshow.com/full-episodes/'
2415             else:
2416                 url = u'http://www.colbertnation.com/full-episodes/'
2417             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2418             assert mobj is not None
2419
2420         if mobj.group('clip'):
2421             if mobj.group('showname') == 'thedailyshow':
2422                 epTitle = mobj.group('tdstitle')
2423             else:
2424                 epTitle = mobj.group('cntitle')
2425             dlNewest = False
2426         else:
2427             dlNewest = not mobj.group('episode')
2428             if dlNewest:
2429                 epTitle = mobj.group('showname')
2430             else:
2431                 epTitle = mobj.group('episode')
2432
2433         req = compat_urllib_request.Request(url)
2434         self.report_extraction(epTitle)
2435         try:
2436             htmlHandle = compat_urllib_request.urlopen(req)
2437             html = htmlHandle.read()
2438             webpage = html.decode('utf-8')
2439         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2440             self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2441             return
2442         if dlNewest:
2443             url = htmlHandle.geturl()
2444             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2445             if mobj is None:
2446                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2447                 return
2448             if mobj.group('episode') == '':
2449                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2450                 return
2451             epTitle = mobj.group('episode')
2452
2453         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2454
2455         if len(mMovieParams) == 0:
2456             # The Colbert Report embeds the information in a without
2457             # a URL prefix; so extract the alternate reference
2458             # and then add the URL prefix manually.
2459
2460             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2461             if len(altMovieParams) == 0:
2462                 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2463                 return
2464             else:
2465                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2466
2467         uri = mMovieParams[0][1]
2468         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2469         self.report_index_download(epTitle)
2470         try:
2471             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2472         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2473             self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2474             return
2475
2476         results = []
2477
2478         idoc = xml.etree.ElementTree.fromstring(indexXml)
2479         itemEls = idoc.findall('.//item')
2480         for partNum,itemEl in enumerate(itemEls):
2481             mediaId = itemEl.findall('./guid')[0].text
2482             shortMediaId = mediaId.split(':')[-1]
2483             showId = mediaId.split(':')[-2].replace('.com', '')
2484             officialTitle = itemEl.findall('./title')[0].text
2485             officialDate = itemEl.findall('./pubDate')[0].text
2486
2487             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2488                         compat_urllib_parse.urlencode({'uri': mediaId}))
2489             configReq = compat_urllib_request.Request(configUrl)
2490             self.report_config_download(epTitle, shortMediaId)
2491             try:
2492                 configXml = compat_urllib_request.urlopen(configReq).read()
2493             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2494                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2495                 return
2496
2497             cdoc = xml.etree.ElementTree.fromstring(configXml)
2498             turls = []
2499             for rendition in cdoc.findall('.//rendition'):
2500                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2501                 turls.append(finfo)
2502
2503             if len(turls) == 0:
2504                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2505                 continue
2506
2507             if self._downloader.params.get('listformats', None):
2508                 self._print_formats([i[0] for i in turls])
2509                 return
2510
2511             # For now, just pick the highest bitrate
2512             format,rtmp_video_url = turls[-1]
2513
2514             # Get the format arg from the arg stream
2515             req_format = self._downloader.params.get('format', None)
2516
2517             # Select format if we can find one
2518             for f,v in turls:
2519                 if f == req_format:
2520                     format, rtmp_video_url = f, v
2521                     break
2522
2523             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2524             if not m:
2525                 raise ExtractorError(u'Cannot transform RTMP url')
2526             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2527             video_url = base + m.group('finalid')
2528
2529             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2530             info = {
2531                 'id': shortMediaId,
2532                 'url': video_url,
2533                 'uploader': showId,
2534                 'upload_date': officialDate,
2535                 'title': effTitle,
2536                 'ext': 'mp4',
2537                 'format': format,
2538                 'thumbnail': None,
2539                 'description': officialTitle,
2540             }
2541             results.append(info)
2542
2543         return results
2544
2545
2546 class EscapistIE(InfoExtractor):
2547     """Information extractor for The Escapist """
2548
2549     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2550     IE_NAME = u'escapist'
2551
2552     def report_extraction(self, showName):
2553         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2554
2555     def report_config_download(self, showName):
2556         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2557
2558     def _real_extract(self, url):
2559         mobj = re.match(self._VALID_URL, url)
2560         if mobj is None:
2561             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2562             return
2563         showName = mobj.group('showname')
2564         videoId = mobj.group('episode')
2565
2566         self.report_extraction(showName)
2567         try:
2568             webPage = compat_urllib_request.urlopen(url)
2569             webPageBytes = webPage.read()
2570             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2571             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2572         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2573             self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
2574             return
2575
2576         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2577         description = unescapeHTML(descMatch.group(1))
2578         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2579         imgUrl = unescapeHTML(imgMatch.group(1))
2580         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2581         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2582         configUrlMatch = re.search('config=(.*)$', playerUrl)
2583         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2584
2585         self.report_config_download(showName)
2586         try:
2587             configJSON = compat_urllib_request.urlopen(configUrl)
2588             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2589             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2590         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2591             self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2592             return
2593
2594         # Technically, it's JavaScript, not JSON
2595         configJSON = configJSON.replace("'", '"')
2596
2597         try:
2598             config = json.loads(configJSON)
2599         except (ValueError,) as err:
2600             self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2601             return
2602
2603         playlist = config['playlist']
2604         videoUrl = playlist[1]['url']
2605
2606         info = {
2607             'id': videoId,
2608             'url': videoUrl,
2609             'uploader': showName,
2610             'upload_date': None,
2611             'title': showName,
2612             'ext': 'mp4',
2613             'thumbnail': imgUrl,
2614             'description': description,
2615             'player_url': playerUrl,
2616         }
2617
2618         return [info]
2619
2620 class CollegeHumorIE(InfoExtractor):
2621     """Information extractor for collegehumor.com"""
2622
2623     _WORKING = False
2624     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2625     IE_NAME = u'collegehumor'
2626
2627     def report_manifest(self, video_id):
2628         """Report information extraction."""
2629         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2630
2631     def report_extraction(self, video_id):
2632         """Report information extraction."""
2633         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2634
2635     def _real_extract(self, url):
2636         mobj = re.match(self._VALID_URL, url)
2637         if mobj is None:
2638             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2639             return
2640         video_id = mobj.group('videoid')
2641
2642         info = {
2643             'id': video_id,
2644             'uploader': None,
2645             'upload_date': None,
2646         }
2647
2648         self.report_extraction(video_id)
2649         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2650         try:
2651             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2652         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2653             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2654             return
2655
2656         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2657         try:
2658             videoNode = mdoc.findall('./video')[0]
2659             info['description'] = videoNode.findall('./description')[0].text
2660             info['title'] = videoNode.findall('./caption')[0].text
2661             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2662             manifest_url = videoNode.findall('./file')[0].text
2663         except IndexError:
2664             self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2665             return
2666
2667         manifest_url += '?hdcore=2.10.3'
2668         self.report_manifest(video_id)
2669         try:
2670             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2671         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2672             self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2673             return
2674
2675         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2676         try:
2677             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2678             node_id = media_node.attrib['url']
2679             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2680         except IndexError as err:
2681             self._downloader.trouble(u'\nERROR: Invalid manifest file')
2682             return
2683
2684         url_pr = compat_urllib_parse_urlparse(manifest_url)
2685         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2686
2687         info['url'] = url
2688         info['ext'] = 'f4f'
2689         return [info]
2690
2691
2692 class XVideosIE(InfoExtractor):
2693     """Information extractor for xvideos.com"""
2694
2695     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2696     IE_NAME = u'xvideos'
2697
2698     def report_extraction(self, video_id):
2699         """Report information extraction."""
2700         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2701
2702     def _real_extract(self, url):
2703         mobj = re.match(self._VALID_URL, url)
2704         if mobj is None:
2705             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2706             return
2707         video_id = mobj.group(1)
2708
2709         webpage = self._download_webpage(url, video_id)
2710
2711         self.report_extraction(video_id)
2712
2713
2714         # Extract video URL
2715         mobj = re.search(r'flv_url=(.+?)&', webpage)
2716         if mobj is None:
2717             self._downloader.trouble(u'ERROR: unable to extract video url')
2718             return
2719         video_url = compat_urllib_parse.unquote(mobj.group(1))
2720
2721
2722         # Extract title
2723         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2724         if mobj is None:
2725             self._downloader.trouble(u'ERROR: unable to extract video title')
2726             return
2727         video_title = mobj.group(1)
2728
2729
2730         # Extract video thumbnail
2731         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2732         if mobj is None:
2733             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2734             return
2735         video_thumbnail = mobj.group(0)
2736
2737         info = {
2738             'id': video_id,
2739             'url': video_url,
2740             'uploader': None,
2741             'upload_date': None,
2742             'title': video_title,
2743             'ext': 'flv',
2744             'thumbnail': video_thumbnail,
2745             'description': None,
2746         }
2747
2748         return [info]
2749
2750
2751 class SoundcloudIE(InfoExtractor):
2752     """Information extractor for soundcloud.com
2753        To access the media, the uid of the song and a stream token
2754        must be extracted from the page source and the script must make
2755        a request to media.soundcloud.com/crossdomain.xml. Then
2756        the media can be grabbed by requesting from an url composed
2757        of the stream token and uid
2758      """
2759
2760     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2761     IE_NAME = u'soundcloud'
2762
2763     def __init__(self, downloader=None):
2764         InfoExtractor.__init__(self, downloader)
2765
2766     def report_resolve(self, video_id):
2767         """Report information extraction."""
2768         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2769
2770     def report_extraction(self, video_id):
2771         """Report information extraction."""
2772         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2773
2774     def _real_extract(self, url):
2775         mobj = re.match(self._VALID_URL, url)
2776         if mobj is None:
2777             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2778             return
2779
2780         # extract uploader (which is in the url)
2781         uploader = mobj.group(1)
2782         # extract simple title (uploader + slug of song title)
2783         slug_title =  mobj.group(2)
2784         simple_title = uploader + u'-' + slug_title
2785
2786         self.report_resolve('%s/%s' % (uploader, slug_title))
2787
2788         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2789         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2790         request = compat_urllib_request.Request(resolv_url)
2791         try:
2792             info_json_bytes = compat_urllib_request.urlopen(request).read()
2793             info_json = info_json_bytes.decode('utf-8')
2794         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2795             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2796             return
2797
2798         info = json.loads(info_json)
2799         video_id = info['id']
2800         self.report_extraction('%s/%s' % (uploader, slug_title))
2801
2802         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2803         request = compat_urllib_request.Request(streams_url)
2804         try:
2805             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2806             stream_json = stream_json_bytes.decode('utf-8')
2807         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2808             self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2809             return
2810
2811         streams = json.loads(stream_json)
2812         mediaURL = streams['http_mp3_128_url']
2813
2814         return [{
2815             'id':       info['id'],
2816             'url':      mediaURL,
2817             'uploader': info['user']['username'],
2818             'upload_date':  info['created_at'],
2819             'title':    info['title'],
2820             'ext':      u'mp3',
2821             'description': info['description'],
2822         }]
2823
2824
2825 class InfoQIE(InfoExtractor):
2826     """Information extractor for infoq.com"""
2827     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2828
2829     def report_extraction(self, video_id):
2830         """Report information extraction."""
2831         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2832
2833     def _real_extract(self, url):
2834         mobj = re.match(self._VALID_URL, url)
2835         if mobj is None:
2836             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2837             return
2838
2839         webpage = self._download_webpage(url, video_id=url)
2840         self.report_extraction(url)
2841
2842         # Extract video URL
2843         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2844         if mobj is None:
2845             self._downloader.trouble(u'ERROR: unable to extract video url')
2846             return
2847         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2848         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2849
2850         # Extract title
2851         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2852         if mobj is None:
2853             self._downloader.trouble(u'ERROR: unable to extract video title')
2854             return
2855         video_title = mobj.group(1)
2856
2857         # Extract description
2858         video_description = u'No description available.'
2859         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2860         if mobj is not None:
2861             video_description = mobj.group(1)
2862
2863         video_filename = video_url.split('/')[-1]
2864         video_id, extension = video_filename.split('.')
2865
2866         info = {
2867             'id': video_id,
2868             'url': video_url,
2869             'uploader': None,
2870             'upload_date': None,
2871             'title': video_title,
2872             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2873             'thumbnail': None,
2874             'description': video_description,
2875         }
2876
2877         return [info]
2878
2879 class MixcloudIE(InfoExtractor):
2880     """Information extractor for www.mixcloud.com"""
2881
2882     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2883     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2884     IE_NAME = u'mixcloud'
2885
2886     def __init__(self, downloader=None):
2887         InfoExtractor.__init__(self, downloader)
2888
2889     def report_download_json(self, file_id):
2890         """Report JSON download."""
2891         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2892
2893     def report_extraction(self, file_id):
2894         """Report information extraction."""
2895         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2896
2897     def get_urls(self, jsonData, fmt, bitrate='best'):
2898         """Get urls from 'audio_formats' section in json"""
2899         file_url = None
2900         try:
2901             bitrate_list = jsonData[fmt]
2902             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2903                 bitrate = max(bitrate_list) # select highest
2904
2905             url_list = jsonData[fmt][bitrate]
2906         except TypeError: # we have no bitrate info.
2907             url_list = jsonData[fmt]
2908         return url_list
2909
2910     def check_urls(self, url_list):
2911         """Returns 1st active url from list"""
2912         for url in url_list:
2913             try:
2914                 compat_urllib_request.urlopen(url)
2915                 return url
2916             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2917                 url = None
2918
2919         return None
2920
2921     def _print_formats(self, formats):
2922         print('Available formats:')
2923         for fmt in formats.keys():
2924             for b in formats[fmt]:
2925                 try:
2926                     ext = formats[fmt][b][0]
2927                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2928                 except TypeError: # we have no bitrate info
2929                     ext = formats[fmt][0]
2930                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2931                     break
2932
2933     def _real_extract(self, url):
2934         mobj = re.match(self._VALID_URL, url)
2935         if mobj is None:
2936             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2937             return
2938         # extract uploader & filename from url
2939         uploader = mobj.group(1).decode('utf-8')
2940         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2941
2942         # construct API request
2943         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2944         # retrieve .json file with links to files
2945         request = compat_urllib_request.Request(file_url)
2946         try:
2947             self.report_download_json(file_url)
2948             jsonData = compat_urllib_request.urlopen(request).read()
2949         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2950             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2951             return
2952
2953         # parse JSON
2954         json_data = json.loads(jsonData)
2955         player_url = json_data['player_swf_url']
2956         formats = dict(json_data['audio_formats'])
2957
2958         req_format = self._downloader.params.get('format', None)
2959         bitrate = None
2960
2961         if self._downloader.params.get('listformats', None):
2962             self._print_formats(formats)
2963             return
2964
2965         if req_format is None or req_format == 'best':
2966             for format_param in formats.keys():
2967                 url_list = self.get_urls(formats, format_param)
2968                 # check urls
2969                 file_url = self.check_urls(url_list)
2970                 if file_url is not None:
2971                     break # got it!
2972         else:
2973             if req_format not in formats:
2974                 self._downloader.trouble(u'ERROR: format is not available')
2975                 return
2976
2977             url_list = self.get_urls(formats, req_format)
2978             file_url = self.check_urls(url_list)
2979             format_param = req_format
2980
2981         return [{
2982             'id': file_id.decode('utf-8'),
2983             'url': file_url.decode('utf-8'),
2984             'uploader': uploader.decode('utf-8'),
2985             'upload_date': None,
2986             'title': json_data['name'],
2987             'ext': file_url.split('.')[-1].decode('utf-8'),
2988             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2989             'thumbnail': json_data['thumbnail_url'],
2990             'description': json_data['description'],
2991             'player_url': player_url.decode('utf-8'),
2992         }]
2993
2994 class StanfordOpenClassroomIE(InfoExtractor):
2995     """Information extractor for Stanford's Open ClassRoom"""
2996
2997     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2998     IE_NAME = u'stanfordoc'
2999
3000     def report_download_webpage(self, objid):
3001         """Report information extraction."""
3002         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3003
3004     def report_extraction(self, video_id):
3005         """Report information extraction."""
3006         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3007
3008     def _real_extract(self, url):
3009         mobj = re.match(self._VALID_URL, url)
3010         if mobj is None:
3011             raise ExtractorError(u'Invalid URL: %s' % url)
3012
3013         if mobj.group('course') and mobj.group('video'): # A specific video
3014             course = mobj.group('course')
3015             video = mobj.group('video')
3016             info = {
3017                 'id': course + '_' + video,
3018                 'uploader': None,
3019                 'upload_date': None,
3020             }
3021
3022             self.report_extraction(info['id'])
3023             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3024             xmlUrl = baseUrl + video + '.xml'
3025             try:
3026                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3027             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3028                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3029                 return
3030             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3031             try:
3032                 info['title'] = mdoc.findall('./title')[0].text
3033                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3034             except IndexError:
3035                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3036                 return
3037             info['ext'] = info['url'].rpartition('.')[2]
3038             return [info]
3039         elif mobj.group('course'): # A course page
3040             course = mobj.group('course')
3041             info = {
3042                 'id': course,
3043                 'type': 'playlist',
3044                 'uploader': None,
3045                 'upload_date': None,
3046             }
3047
3048             coursepage = self._download_webpage(url, info['id'],
3049                                         note='Downloading course info page',
3050                                         errnote='Unable to download course info page')
3051
3052             m = re.search('<h1>([^<]+)</h1>', coursepage)
3053             if m:
3054                 info['title'] = unescapeHTML(m.group(1))
3055             else:
3056                 info['title'] = info['id']
3057
3058             m = re.search('<description>([^<]+)</description>', coursepage)
3059             if m:
3060                 info['description'] = unescapeHTML(m.group(1))
3061
3062             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3063             info['list'] = [
3064                 {
3065                     'type': 'reference',
3066                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3067                 }
3068                     for vpage in links]
3069             results = []
3070             for entry in info['list']:
3071                 assert entry['type'] == 'reference'
3072                 results += self.extract(entry['url'])
3073             return results
3074         else: # Root page
3075             info = {
3076                 'id': 'Stanford OpenClassroom',
3077                 'type': 'playlist',
3078                 'uploader': None,
3079                 'upload_date': None,
3080             }
3081
3082             self.report_download_webpage(info['id'])
3083             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3084             try:
3085                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3086             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3087                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3088                 return
3089
3090             info['title'] = info['id']
3091
3092             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3093             info['list'] = [
3094                 {
3095                     'type': 'reference',
3096                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3097                 }
3098                     for cpage in links]
3099
3100             results = []
3101             for entry in info['list']:
3102                 assert entry['type'] == 'reference'
3103                 results += self.extract(entry['url'])
3104             return results
3105
3106 class MTVIE(InfoExtractor):
3107     """Information extractor for MTV.com"""
3108
3109     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3110     IE_NAME = u'mtv'
3111
3112     def report_extraction(self, video_id):
3113         """Report information extraction."""
3114         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3115
3116     def _real_extract(self, url):
3117         mobj = re.match(self._VALID_URL, url)
3118         if mobj is None:
3119             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3120             return
3121         if not mobj.group('proto'):
3122             url = 'http://' + url
3123         video_id = mobj.group('videoid')
3124
3125         webpage = self._download_webpage(url, video_id)
3126
3127         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3128         if mobj is None:
3129             self._downloader.trouble(u'ERROR: unable to extract song name')
3130             return
3131         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3132         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3133         if mobj is None:
3134             self._downloader.trouble(u'ERROR: unable to extract performer')
3135             return
3136         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3137         video_title = performer + ' - ' + song_name
3138
3139         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3140         if mobj is None:
3141             self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3142             return
3143         mtvn_uri = mobj.group(1)
3144
3145         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3146         if mobj is None:
3147             self._downloader.trouble(u'ERROR: unable to extract content id')
3148             return
3149         content_id = mobj.group(1)
3150
3151         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3152         self.report_extraction(video_id)
3153         request = compat_urllib_request.Request(videogen_url)
3154         try:
3155             metadataXml = compat_urllib_request.urlopen(request).read()
3156         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3157             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3158             return
3159
3160         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3161         renditions = mdoc.findall('.//rendition')
3162
3163         # For now, always pick the highest quality.
3164         rendition = renditions[-1]
3165
3166         try:
3167             _,_,ext = rendition.attrib['type'].partition('/')
3168             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3169             video_url = rendition.find('./src').text
3170         except KeyError:
3171             self._downloader.trouble('Invalid rendition field.')
3172             return
3173
3174         info = {
3175             'id': video_id,
3176             'url': video_url,
3177             'uploader': performer,
3178             'upload_date': None,
3179             'title': video_title,
3180             'ext': ext,
3181             'format': format,
3182         }
3183
3184         return [info]
3185
3186
3187 class YoukuIE(InfoExtractor):
3188     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3189
3190     def report_download_webpage(self, file_id):
3191         """Report webpage download."""
3192         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3193
3194     def report_extraction(self, file_id):
3195         """Report information extraction."""
3196         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3197
3198     def _gen_sid(self):
3199         nowTime = int(time.time() * 1000)
3200         random1 = random.randint(1000,1998)
3201         random2 = random.randint(1000,9999)
3202
3203         return "%d%d%d" %(nowTime,random1,random2)
3204
3205     def _get_file_ID_mix_string(self, seed):
3206         mixed = []
3207         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3208         seed = float(seed)
3209         for i in range(len(source)):
3210             seed  =  (seed * 211 + 30031 ) % 65536
3211             index  =  math.floor(seed / 65536 * len(source) )
3212             mixed.append(source[int(index)])
3213             source.remove(source[int(index)])
3214         #return ''.join(mixed)
3215         return mixed
3216
3217     def _get_file_id(self, fileId, seed):
3218         mixed = self._get_file_ID_mix_string(seed)
3219         ids = fileId.split('*')
3220         realId = []
3221         for ch in ids:
3222             if ch:
3223                 realId.append(mixed[int(ch)])
3224         return ''.join(realId)
3225
3226     def _real_extract(self, url):
3227         mobj = re.match(self._VALID_URL, url)
3228         if mobj is None:
3229             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3230             return
3231         video_id = mobj.group('ID')
3232
3233         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3234
3235         request = compat_urllib_request.Request(info_url, None, std_headers)
3236         try:
3237             self.report_download_webpage(video_id)
3238             jsondata = compat_urllib_request.urlopen(request).read()
3239         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3240             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3241             return
3242
3243         self.report_extraction(video_id)
3244         try:
3245             jsonstr = jsondata.decode('utf-8')
3246             config = json.loads(jsonstr)
3247
3248             video_title =  config['data'][0]['title']
3249             seed = config['data'][0]['seed']
3250
3251             format = self._downloader.params.get('format', None)
3252             supported_format = list(config['data'][0]['streamfileids'].keys())
3253
3254             if format is None or format == 'best':
3255                 if 'hd2' in supported_format:
3256                     format = 'hd2'
3257                 else:
3258                     format = 'flv'
3259                 ext = u'flv'
3260             elif format == 'worst':
3261                 format = 'mp4'
3262                 ext = u'mp4'
3263             else:
3264                 format = 'flv'
3265                 ext = u'flv'
3266
3267
3268             fileid = config['data'][0]['streamfileids'][format]
3269             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3270         except (UnicodeDecodeError, ValueError, KeyError):
3271             self._downloader.trouble(u'ERROR: unable to extract info section')
3272             return
3273
3274         files_info=[]
3275         sid = self._gen_sid()
3276         fileid = self._get_file_id(fileid, seed)
3277
3278         #column 8,9 of fileid represent the segment number
3279         #fileid[7:9] should be changed
3280         for index, key in enumerate(keys):
3281
3282             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3283             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3284
3285             info = {
3286                 'id': '%s_part%02d' % (video_id, index),
3287                 'url': download_url,
3288                 'uploader': None,
3289                 'upload_date': None,
3290                 'title': video_title,
3291                 'ext': ext,
3292             }
3293             files_info.append(info)
3294
3295         return files_info
3296
3297
3298 class XNXXIE(InfoExtractor):
3299     """Information extractor for xnxx.com"""
3300
3301     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3302     IE_NAME = u'xnxx'
3303     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3304     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3305     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3306
3307     def report_webpage(self, video_id):
3308         """Report information extraction"""
3309         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3310
3311     def report_extraction(self, video_id):
3312         """Report information extraction"""
3313         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3314
3315     def _real_extract(self, url):
3316         mobj = re.match(self._VALID_URL, url)
3317         if mobj is None:
3318             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3319             return
3320         video_id = mobj.group(1)
3321
3322         self.report_webpage(video_id)
3323
3324         # Get webpage content
3325         try:
3326             webpage_bytes = compat_urllib_request.urlopen(url).read()
3327             webpage = webpage_bytes.decode('utf-8')
3328         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3329             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3330             return
3331
3332         result = re.search(self.VIDEO_URL_RE, webpage)
3333         if result is None:
3334             self._downloader.trouble(u'ERROR: unable to extract video url')
3335             return
3336         video_url = compat_urllib_parse.unquote(result.group(1))
3337
3338         result = re.search(self.VIDEO_TITLE_RE, webpage)
3339         if result is None:
3340             self._downloader.trouble(u'ERROR: unable to extract video title')
3341             return
3342         video_title = result.group(1)
3343
3344         result = re.search(self.VIDEO_THUMB_RE, webpage)
3345         if result is None:
3346             self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3347             return
3348         video_thumbnail = result.group(1)
3349
3350         return [{
3351             'id': video_id,
3352             'url': video_url,
3353             'uploader': None,
3354             'upload_date': None,
3355             'title': video_title,
3356             'ext': 'flv',
3357             'thumbnail': video_thumbnail,
3358             'description': None,
3359         }]
3360
3361
3362 class GooglePlusIE(InfoExtractor):
3363     """Information extractor for plus.google.com."""
3364
3365     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3366     IE_NAME = u'plus.google'
3367
3368     def __init__(self, downloader=None):
3369         InfoExtractor.__init__(self, downloader)
3370
3371     def report_extract_entry(self, url):
3372         """Report downloading extry"""
3373         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3374
3375     def report_date(self, upload_date):
3376         """Report downloading extry"""
3377         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3378
3379     def report_uploader(self, uploader):
3380         """Report downloading extry"""
3381         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3382
3383     def report_title(self, video_title):
3384         """Report downloading extry"""
3385         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3386
3387     def report_extract_vid_page(self, video_page):
3388         """Report information extraction."""
3389         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3390
3391     def _real_extract(self, url):
3392         # Extract id from URL
3393         mobj = re.match(self._VALID_URL, url)
3394         if mobj is None:
3395             self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3396             return
3397
3398         post_url = mobj.group(0)
3399         video_id = mobj.group(1)
3400
3401         video_extension = 'flv'
3402
3403         # Step 1, Retrieve post webpage to extract further information
3404         self.report_extract_entry(post_url)
3405         request = compat_urllib_request.Request(post_url)
3406         try:
3407             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3408         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3409             self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3410             return
3411
3412         # Extract update date
3413         upload_date = None
3414         pattern = 'title="Timestamp">(.*?)</a>'
3415         mobj = re.search(pattern, webpage)
3416         if mobj:
3417             upload_date = mobj.group(1)
3418             # Convert timestring to a format suitable for filename
3419             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3420             upload_date = upload_date.strftime('%Y%m%d')
3421         self.report_date(upload_date)
3422
3423         # Extract uploader
3424         uploader = None
3425         pattern = r'rel\="author".*?>(.*?)</a>'
3426         mobj = re.search(pattern, webpage)
3427         if mobj:
3428             uploader = mobj.group(1)
3429         self.report_uploader(uploader)
3430
3431         # Extract title
3432         # Get the first line for title
3433         video_title = u'NA'
3434         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3435         mobj = re.search(pattern, webpage)
3436         if mobj:
3437             video_title = mobj.group(1)
3438         self.report_title(video_title)
3439
3440         # Step 2, Stimulate clicking the image box to launch video
3441         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3442         mobj = re.search(pattern, webpage)
3443         if mobj is None:
3444             self._downloader.trouble(u'ERROR: unable to extract video page URL')
3445
3446         video_page = mobj.group(1)
3447         request = compat_urllib_request.Request(video_page)
3448         try:
3449             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3450         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3451             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3452             return
3453         self.report_extract_vid_page(video_page)
3454
3455
3456         # Extract video links on video page
3457         """Extract video links of all sizes"""
3458         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3459         mobj = re.findall(pattern, webpage)
3460         if len(mobj) == 0:
3461             self._downloader.trouble(u'ERROR: unable to extract video links')
3462
3463         # Sort in resolution
3464         links = sorted(mobj)
3465
3466         # Choose the lowest of the sort, i.e. highest resolution
3467         video_url = links[-1]
3468         # Only get the url. The resolution part in the tuple has no use anymore
3469         video_url = video_url[-1]
3470         # Treat escaped \u0026 style hex
3471         try:
3472             video_url = video_url.decode("unicode_escape")
3473         except AttributeError: # Python 3
3474             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3475
3476
3477         return [{
3478             'id':       video_id,
3479             'url':      video_url,
3480             'uploader': uploader,
3481             'upload_date':  upload_date,
3482             'title':    video_title,
3483             'ext':      video_extension,
3484         }]
3485
3486 class NBAIE(InfoExtractor):
3487     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3488     IE_NAME = u'nba'
3489
3490     def _real_extract(self, url):
3491         mobj = re.match(self._VALID_URL, url)
3492         if mobj is None:
3493             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3494             return
3495
3496         video_id = mobj.group(1)
3497         if video_id.endswith('/index.html'):
3498             video_id = video_id[:-len('/index.html')]
3499
3500         webpage = self._download_webpage(url, video_id)
3501
3502         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3503         def _findProp(rexp, default=None):
3504             m = re.search(rexp, webpage)
3505             if m:
3506                 return unescapeHTML(m.group(1))
3507             else:
3508                 return default
3509
3510         shortened_video_id = video_id.rpartition('/')[2]
3511         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3512         info = {
3513             'id': shortened_video_id,
3514             'url': video_url,
3515             'ext': 'mp4',
3516             'title': title,
3517             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3518             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3519         }
3520         return [info]
3521
3522 class JustinTVIE(InfoExtractor):
3523     """Information extractor for justin.tv and twitch.tv"""
3524     # TODO: One broadcast may be split into multiple videos. The key
3525     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3526     # starts at 1 and increases. Can we treat all parts as one video?
3527
3528     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3529         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3530     _JUSTIN_PAGE_LIMIT = 100
3531     IE_NAME = u'justin.tv'
3532
3533     def report_extraction(self, file_id):
3534         """Report information extraction."""
3535         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3536
3537     def report_download_page(self, channel, offset):
3538         """Report attempt to download a single page of videos."""
3539         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3540                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3541
3542     # Return count of items, list of *valid* items
3543     def _parse_page(self, url):
3544         try:
3545             urlh = compat_urllib_request.urlopen(url)
3546             webpage_bytes = urlh.read()
3547             webpage = webpage_bytes.decode('utf-8', 'ignore')
3548         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3549             self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3550             return
3551
3552         response = json.loads(webpage)
3553         if type(response) != list:
3554             error_text = response.get('error', 'unknown error')
3555             self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3556             return
3557         info = []
3558         for clip in response:
3559             video_url = clip['video_file_url']
3560             if video_url:
3561                 video_extension = os.path.splitext(video_url)[1][1:]
3562                 video_date = re.sub('-', '', clip['start_time'][:10])
3563                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3564                 video_id = clip['id']
3565                 video_title = clip.get('title', video_id)
3566                 info.append({
3567                     'id': video_id,
3568                     'url': video_url,
3569                     'title': video_title,
3570                     'uploader': clip.get('channel_name', video_uploader_id),
3571                     'uploader_id': video_uploader_id,
3572                     'upload_date': video_date,
3573                     'ext': video_extension,
3574                 })
3575         return (len(response), info)
3576
3577     def _real_extract(self, url):
3578         mobj = re.match(self._VALID_URL, url)
3579         if mobj is None:
3580             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3581             return
3582
3583         api = 'http://api.justin.tv'
3584         video_id = mobj.group(mobj.lastindex)
3585         paged = False
3586         if mobj.lastindex == 1:
3587             paged = True
3588             api += '/channel/archives/%s.json'
3589         else:
3590             api += '/broadcast/by_archive/%s.json'
3591         api = api % (video_id,)
3592
3593         self.report_extraction(video_id)
3594
3595         info = []
3596         offset = 0
3597         limit = self._JUSTIN_PAGE_LIMIT
3598         while True:
3599             if paged:
3600                 self.report_download_page(video_id, offset)
3601             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3602             page_count, page_info = self._parse_page(page_url)
3603             info.extend(page_info)
3604             if not paged or page_count != limit:
3605                 break
3606             offset += limit
3607         return info
3608
3609 class FunnyOrDieIE(InfoExtractor):
3610     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3611
3612     def _real_extract(self, url):
3613         mobj = re.match(self._VALID_URL, url)
3614         if mobj is None:
3615             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3616             return
3617
3618         video_id = mobj.group('id')
3619         webpage = self._download_webpage(url, video_id)
3620
3621         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3622         if not m:
3623             self._downloader.trouble(u'ERROR: unable to find video information')
3624         video_url = unescapeHTML(m.group('url'))
3625
3626         m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3627         if not m:
3628             self._downloader.trouble(u'Cannot find video title')
3629         title = unescapeHTML(m.group('title'))
3630
3631         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3632         if m:
3633             desc = unescapeHTML(m.group('desc'))
3634         else:
3635             desc = None
3636
3637         info = {
3638             'id': video_id,
3639             'url': video_url,
3640             'ext': 'mp4',
3641             'title': title,
3642             'description': desc,
3643         }
3644         return [info]
3645
3646 class SteamIE(InfoExtractor):
3647     _VALID_URL = r"""http://store.steampowered.com/
3648                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3649                 (?P<gameID>\d+)/?
3650                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3651                 """
3652
3653     @classmethod
3654     def suitable(cls, url):
3655         """Receives a URL and returns True if suitable for this IE."""
3656         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3657
3658     def _real_extract(self, url):
3659         m = re.match(self._VALID_URL, url, re.VERBOSE)
3660         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3661         gameID = m.group('gameID')
3662         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3663         webpage = self._download_webpage(videourl, gameID)
3664         mweb = re.finditer(urlRE, webpage)
3665         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3666         titles = re.finditer(namesRE, webpage)
3667         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3668         thumbs = re.finditer(thumbsRE, webpage)
3669         videos = []
3670         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3671             video_id = vid.group('videoID')
3672             title = vtitle.group('videoName')
3673             video_url = vid.group('videoURL')
3674             video_thumb = thumb.group('thumbnail')
3675             if not video_url:
3676                 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3677             info = {
3678                 'id':video_id,
3679                 'url':video_url,
3680                 'ext': 'flv',
3681                 'title': unescapeHTML(title),
3682                 'thumbnail': video_thumb
3683                   }
3684             videos.append(info)
3685         return videos
3686
3687 class UstreamIE(InfoExtractor):
3688     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3689     IE_NAME = u'ustream'
3690
3691     def _real_extract(self, url):
3692         m = re.match(self._VALID_URL, url)
3693         video_id = m.group('videoID')
3694         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3695         webpage = self._download_webpage(url, video_id)
3696         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3697         title = m.group('title')
3698         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3699         uploader = m.group('uploader')
3700         info = {
3701                 'id':video_id,
3702                 'url':video_url,
3703                 'ext': 'flv',
3704                 'title': title,
3705                 'uploader': uploader
3706                   }
3707         return [info]
3708
3709 class RBMARadioIE(InfoExtractor):
3710     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3711
3712     def _real_extract(self, url):
3713         m = re.match(self._VALID_URL, url)
3714         video_id = m.group('videoID')
3715
3716         webpage = self._download_webpage(url, video_id)
3717         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3718         if not m:
3719             raise ExtractorError(u'Cannot find metadata')
3720         json_data = m.group(1)
3721
3722         try:
3723             data = json.loads(json_data)
3724         except ValueError as e:
3725             raise ExtractorError(u'Invalid JSON: ' + str(e))
3726
3727         video_url = data['akamai_url'] + '&cbr=256'
3728         url_parts = compat_urllib_parse_urlparse(video_url)
3729         video_ext = url_parts.path.rpartition('.')[2]
3730         info = {
3731                 'id': video_id,
3732                 'url': video_url,
3733                 'ext': video_ext,
3734                 'title': data['title'],
3735                 'description': data.get('teaser_text'),
3736                 'location': data.get('country_of_origin'),
3737                 'uploader': data.get('host', {}).get('name'),
3738                 'uploader_id': data.get('host', {}).get('slug'),
3739                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3740                 'duration': data.get('duration'),
3741         }
3742         return [info]
3743
3744
3745 class YouPornIE(InfoExtractor):
3746     """Information extractor for youporn.com."""
3747     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3748
3749     def _print_formats(self, formats):
3750         """Print all available formats"""
3751         print(u'Available formats:')
3752         print(u'ext\t\tformat')
3753         print(u'---------------------------------')
3754         for format in formats:
3755             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3756
3757     def _specific(self, req_format, formats):
3758         for x in formats:
3759             if(x["format"]==req_format):
3760                 return x
3761         return None
3762
3763     def _real_extract(self, url):
3764         mobj = re.match(self._VALID_URL, url)
3765         if mobj is None:
3766             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3767             return
3768
3769         video_id = mobj.group('videoid')
3770
3771         req = compat_urllib_request.Request(url)
3772         req.add_header('Cookie', 'age_verified=1')
3773         webpage = self._download_webpage(req, video_id)
3774
3775         # Get the video title
3776         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3777         if result is None:
3778             raise ExtractorError(u'Unable to extract video title')
3779         video_title = result.group('title').strip()
3780
3781         # Get the video date
3782         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3783         if result is None:
3784             self._downloader.report_warning(u'unable to extract video date')
3785             upload_date = None
3786         else:
3787             upload_date = result.group('date').strip()
3788
3789         # Get the video uploader
3790         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3791         if result is None:
3792             self._downloader.report_warning(u'unable to extract uploader')
3793             video_uploader = None
3794         else:
3795             video_uploader = result.group('uploader').strip()
3796             video_uploader = clean_html( video_uploader )
3797
3798         # Get all of the formats available
3799         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3800         result = re.search(DOWNLOAD_LIST_RE, webpage)
3801         if result is None:
3802             raise ExtractorError(u'Unable to extract download list')
3803         download_list_html = result.group('download_list').strip()
3804
3805         # Get all of the links from the page
3806         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3807         links = re.findall(LINK_RE, download_list_html)
3808         if(len(links) == 0):
3809             raise ExtractorError(u'ERROR: no known formats available for video')
3810
3811         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3812
3813         formats = []
3814         for link in links:
3815
3816             # A link looks like this:
3817             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3818             # A path looks like this:
3819             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3820             video_url = unescapeHTML( link )
3821             path = compat_urllib_parse_urlparse( video_url ).path
3822             extension = os.path.splitext( path )[1][1:]
3823             format = path.split('/')[4].split('_')[:2]
3824             size = format[0]
3825             bitrate = format[1]
3826             format = "-".join( format )
3827             title = u'%s-%s-%s' % (video_title, size, bitrate)
3828
3829             formats.append({
3830                 'id': video_id,
3831                 'url': video_url,
3832                 'uploader': video_uploader,
3833                 'upload_date': upload_date,
3834                 'title': title,
3835                 'ext': extension,
3836                 'format': format,
3837                 'thumbnail': None,
3838                 'description': None,
3839                 'player_url': None
3840             })
3841
3842         if self._downloader.params.get('listformats', None):
3843             self._print_formats(formats)
3844             return
3845
3846         req_format = self._downloader.params.get('format', None)
3847         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3848
3849         if req_format is None or req_format == 'best':
3850             return [formats[0]]
3851         elif req_format == 'worst':
3852             return [formats[-1]]
3853         elif req_format in ('-1', 'all'):
3854             return formats
3855         else:
3856             format = self._specific( req_format, formats )
3857             if result is None:
3858                 self._downloader.trouble(u'ERROR: requested format not available')
3859                 return
3860             return [format]
3861
3862
3863
3864 class PornotubeIE(InfoExtractor):
3865     """Information extractor for pornotube.com."""
3866     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3867
3868     def _real_extract(self, url):
3869         mobj = re.match(self._VALID_URL, url)
3870         if mobj is None:
3871             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3872             return
3873
3874         video_id = mobj.group('videoid')
3875         video_title = mobj.group('title')
3876
3877         # Get webpage content
3878         webpage = self._download_webpage(url, video_id)
3879
3880         # Get the video URL
3881         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3882         result = re.search(VIDEO_URL_RE, webpage)
3883         if result is None:
3884             self._downloader.trouble(u'ERROR: unable to extract video url')
3885             return
3886         video_url = compat_urllib_parse.unquote(result.group('url'))
3887
3888         #Get the uploaded date
3889         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3890         result = re.search(VIDEO_UPLOADED_RE, webpage)
3891         if result is None:
3892             self._downloader.trouble(u'ERROR: unable to extract video title')
3893             return
3894         upload_date = result.group('date')
3895
3896         info = {'id': video_id,
3897                 'url': video_url,
3898                 'uploader': None,
3899                 'upload_date': upload_date,
3900                 'title': video_title,
3901                 'ext': 'flv',
3902                 'format': 'flv'}
3903
3904         return [info]
3905
3906 class YouJizzIE(InfoExtractor):
3907     """Information extractor for youjizz.com."""
3908     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3909
3910     def _real_extract(self, url):
3911         mobj = re.match(self._VALID_URL, url)
3912         if mobj is None:
3913             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3914             return
3915
3916         video_id = mobj.group('videoid')
3917
3918         # Get webpage content
3919         webpage = self._download_webpage(url, video_id)
3920
3921         # Get the video title
3922         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3923         if result is None:
3924             raise ExtractorError(u'ERROR: unable to extract video title')
3925         video_title = result.group('title').strip()
3926
3927         # Get the embed page
3928         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3929         if result is None:
3930             raise ExtractorError(u'ERROR: unable to extract embed page')
3931
3932         embed_page_url = result.group(0).strip()
3933         video_id = result.group('videoid')
3934
3935         webpage = self._download_webpage(embed_page_url, video_id)
3936
3937         # Get the video URL
3938         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3939         if result is None:
3940             raise ExtractorError(u'ERROR: unable to extract video url')
3941         video_url = result.group('source')
3942
3943         info = {'id': video_id,
3944                 'url': video_url,
3945                 'title': video_title,
3946                 'ext': 'flv',
3947                 'format': 'flv',
3948                 'player_url': embed_page_url}
3949
3950         return [info]
3951
3952 class EightTracksIE(InfoExtractor):
3953     IE_NAME = '8tracks'
3954     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3955
3956     def _real_extract(self, url):
3957         mobj = re.match(self._VALID_URL, url)
3958         if mobj is None:
3959             raise ExtractorError(u'Invalid URL: %s' % url)
3960         playlist_id = mobj.group('id')
3961
3962         webpage = self._download_webpage(url, playlist_id)
3963
3964         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3965         if not m:
3966             raise ExtractorError(u'Cannot find trax information')
3967         json_like = m.group(1)
3968         data = json.loads(json_like)
3969
3970         session = str(random.randint(0, 1000000000))
3971         mix_id = data['id']
3972         track_count = data['tracks_count']
3973         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3974         next_url = first_url
3975         res = []
3976         for i in itertools.count():
3977             api_json = self._download_webpage(next_url, playlist_id,
3978                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3979                 errnote=u'Failed to download song information')
3980             api_data = json.loads(api_json)
3981             track_data = api_data[u'set']['track']
3982             info = {
3983                 'id': track_data['id'],
3984                 'url': track_data['track_file_stream_url'],
3985                 'title': track_data['performer'] + u' - ' + track_data['name'],
3986                 'raw_title': track_data['name'],
3987                 'uploader_id': data['user']['login'],
3988                 'ext': 'm4a',
3989             }
3990             res.append(info)
3991             if api_data['set']['at_last_track']:
3992                 break
3993             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3994         return res
3995
3996 class KeekIE(InfoExtractor):
3997     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3998     IE_NAME = u'keek'
3999
4000     def _real_extract(self, url):
4001         m = re.match(self._VALID_URL, url)
4002         video_id = m.group('videoID')
4003         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4004         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4005         webpage = self._download_webpage(url, video_id)
4006         m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
4007         title = unescapeHTML(m.group('title'))
4008         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4009         uploader = clean_html(m.group('uploader'))
4010         info = {
4011                 'id': video_id,
4012                 'url': video_url,
4013                 'ext': 'mp4',
4014                 'title': title,
4015                 'thumbnail': thumbnail,
4016                 'uploader': uploader
4017         }
4018         return [info]
4019
4020 class TEDIE(InfoExtractor):
4021     _VALID_URL=r'''http://www.ted.com/
4022                    (
4023                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4024                         |
4025                         ((?P<type_talk>talks)) # We have a simple talk
4026                    )
4027                    /(?P<name>\w+) # Here goes the name and then ".html"
4028                    '''
4029
4030     @classmethod
4031     def suitable(cls, url):
4032         """Receives a URL and returns True if suitable for this IE."""
4033         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4034
4035     def _real_extract(self, url):
4036         m=re.match(self._VALID_URL, url, re.VERBOSE)
4037         if m.group('type_talk'):
4038             return [self._talk_info(url)]
4039         else :
4040             playlist_id=m.group('playlist_id')
4041             name=m.group('name')
4042             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4043             return self._playlist_videos_info(url,name,playlist_id)
4044
4045     def _talk_video_link(self,mediaSlug):
4046         '''Returns the video link for that mediaSlug'''
4047         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4048
4049     def _playlist_videos_info(self,url,name,playlist_id=0):
4050         '''Returns the videos of the playlist'''
4051         video_RE=r'''
4052                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4053                      ([.\s]*?)data-playlist_item_id="(\d+)"
4054                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4055                      '''
4056         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4057         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4058         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4059         m_names=re.finditer(video_name_RE,webpage)
4060         info=[]
4061         for m_video, m_name in zip(m_videos,m_names):
4062             video_id=m_video.group('video_id')
4063             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4064             info.append(self._talk_info(talk_url,video_id))
4065         return info
4066
4067     def _talk_info(self, url, video_id=0):
4068         """Return the video for the talk in the url"""
4069         m=re.match(self._VALID_URL, url,re.VERBOSE)
4070         videoName=m.group('name')
4071         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4072         # If the url includes the language we get the title translated
4073         title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4074         title=re.search(title_RE, webpage).group('title')
4075         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4076                         "id":(?P<videoID>[\d]+).*?
4077                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4078         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4079         thumb_match=re.search(thumb_RE,webpage)
4080         info_match=re.search(info_RE,webpage,re.VERBOSE)
4081         video_id=info_match.group('videoID')
4082         mediaSlug=info_match.group('mediaSlug')
4083         video_url=self._talk_video_link(mediaSlug)
4084         info = {
4085                 'id': video_id,
4086                 'url': video_url,
4087                 'ext': 'mp4',
4088                 'title': title,
4089                 'thumbnail': thumb_match.group('thumbnail')
4090                 }
4091         return info
4092
4093 class MySpassIE(InfoExtractor):
4094     _VALID_URL = r'http://www.myspass.de/.*'
4095
4096     def _real_extract(self, url):
4097         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4098
4099         # video id is the last path element of the URL
4100         # usually there is a trailing slash, so also try the second but last
4101         url_path = compat_urllib_parse_urlparse(url).path
4102         url_parent_path, video_id = os.path.split(url_path)
4103         if not video_id:
4104             _, video_id = os.path.split(url_parent_path)
4105
4106         # get metadata
4107         metadata_url = META_DATA_URL_TEMPLATE % video_id
4108         metadata_text = self._download_webpage(metadata_url, video_id)
4109         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4110
4111         # extract values from metadata
4112         url_flv_el = metadata.find('url_flv')
4113         if url_flv_el is None:
4114             self._downloader.trouble(u'ERROR: unable to extract download url')
4115             return
4116         video_url = url_flv_el.text
4117         extension = os.path.splitext(video_url)[1][1:]
4118         title_el = metadata.find('title')
4119         if title_el is None:
4120             self._downloader.trouble(u'ERROR: unable to extract title')
4121             return
4122         title = title_el.text
4123         format_id_el = metadata.find('format_id')
4124         if format_id_el is None:
4125             format = ext
4126         else:
4127             format = format_id_el.text
4128         description_el = metadata.find('description')
4129         if description_el is not None:
4130             description = description_el.text
4131         else:
4132             description = None
4133         imagePreview_el = metadata.find('imagePreview')
4134         if imagePreview_el is not None:
4135             thumbnail = imagePreview_el.text
4136         else:
4137             thumbnail = None
4138         info = {
4139             'id': video_id,
4140             'url': video_url,
4141             'title': title,
4142             'ext': extension,
4143             'format': format,
4144             'thumbnail': thumbnail,
4145             'description': description
4146         }
4147         return [info]
4148
4149 class SpiegelIE(InfoExtractor):
4150     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?$'
4151
4152     def _real_extract(self, url):
4153         m = re.match(self._VALID_URL, url)
4154         video_id = m.group('videoID')
4155
4156         webpage = self._download_webpage(url, video_id)
4157         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4158         if not m:
4159             raise ExtractorError(u'Cannot find title')
4160         video_title = unescapeHTML(m.group(1))
4161
4162         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4163         xml_code = self._download_webpage(xml_url, video_id,
4164                     note=u'Downloading XML', errnote=u'Failed to download XML')
4165
4166         idoc = xml.etree.ElementTree.fromstring(xml_code)
4167         last_type = idoc[-1]
4168         filename = last_type.findall('./filename')[0].text
4169         duration = float(last_type.findall('./duration')[0].text)
4170
4171         video_url = 'http://video2.spiegel.de/flash/' + filename
4172         video_ext = filename.rpartition('.')[2]
4173         info = {
4174             'id': video_id,
4175             'url': video_url,
4176             'ext': video_ext,
4177             'title': video_title,
4178             'duration': duration,
4179         }
4180         return [info]
4181
4182
4183 def gen_extractors():
4184     """ Return a list of an instance of every supported extractor.
4185     The order does matter; the first extractor matched is the one handling the URL.
4186     """
4187     return [
4188         YoutubePlaylistIE(),
4189         YoutubeChannelIE(),
4190         YoutubeUserIE(),
4191         YoutubeSearchIE(),
4192         YoutubeIE(),
4193         MetacafeIE(),
4194         DailymotionIE(),
4195         GoogleSearchIE(),
4196         PhotobucketIE(),
4197         YahooIE(),
4198         YahooSearchIE(),
4199         DepositFilesIE(),
4200         FacebookIE(),
4201         BlipTVUserIE(),
4202         BlipTVIE(),
4203         VimeoIE(),
4204         MyVideoIE(),
4205         ComedyCentralIE(),
4206         EscapistIE(),
4207         CollegeHumorIE(),
4208         XVideosIE(),
4209         SoundcloudIE(),
4210         InfoQIE(),
4211         MixcloudIE(),
4212         StanfordOpenClassroomIE(),
4213         MTVIE(),
4214         YoukuIE(),
4215         XNXXIE(),
4216         YouJizzIE(),
4217         PornotubeIE(),
4218         YouPornIE(),
4219         GooglePlusIE(),
4220         ArteTvIE(),
4221         NBAIE(),
4222         JustinTVIE(),
4223         FunnyOrDieIE(),
4224         SteamIE(),
4225         UstreamIE(),
4226         RBMARadioIE(),
4227         EightTracksIE(),
4228         KeekIE(),
4229         TEDIE(),
4230         MySpassIE(),
4231         SpiegelIE(),
4232         GenericIE()
4233     ]
4234
4235