release 2013.04.03
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance.
    _ready = False
    # FileDownloader instance, set via set_downloader().
    _downloader = None
    # Subclasses set this to False to mark the extractor as broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only the first call does real work; subsequent calls are no-ops.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Default IE name: the class name minus the trailing 'IE'."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second positional argument carries the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, e.g.
        # "text/html; charset=iso-8859-1"; fall back to UTF-8.
        # NOTE(review): a quoted charset (charset="utf-8") would be captured
        # with its quotes and make decode() raise LookupError -- confirm.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        # 'replace' keeps decoding going on bad bytes instead of raising.
        return webpage_bytes.decode(encoding, 'replace')
137
138
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches watch/embed/shortener URLs and also a bare video ID; the
    # second top-level capture group is the video ID (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Visited once in _real_initialize to pin the site language to English,
    # so the scraping regexps below match.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Extracts the original target from age-verification redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/file extension for the downloaded file.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string, shown by _print_formats (--list-formats).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
198
199     @classmethod
200     def suitable(cls, url):
201         """Receives a URL and returns True if suitable for this IE."""
202         if YoutubePlaylistIE.suitable(url): return False
203         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
204
205     def report_lang(self):
206         """Report attempt to set language."""
207         self._downloader.to_screen(u'[youtube] Setting language')
208
209     def report_login(self):
210         """Report attempt to log in."""
211         self._downloader.to_screen(u'[youtube] Logging in')
212
213     def report_age_confirmation(self):
214         """Report attempt to confirm age."""
215         self._downloader.to_screen(u'[youtube] Confirming age')
216
217     def report_video_webpage_download(self, video_id):
218         """Report attempt to download video webpage."""
219         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
220
221     def report_video_info_webpage_download(self, video_id):
222         """Report attempt to download video info webpage."""
223         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
224
225     def report_video_subtitles_download(self, video_id):
226         """Report attempt to download video info webpage."""
227         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
228
229     def report_video_subtitles_request(self, video_id, sub_lang, format):
230         """Report attempt to download video info webpage."""
231         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
232
233     def report_video_subtitles_available(self, video_id, sub_lang_list):
234         """Report available subtitles."""
235         sub_lang = ",".join(list(sub_lang_list.keys()))
236         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
237
238     def report_information_extraction(self, video_id):
239         """Report attempt to extract video information."""
240         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
241
242     def report_unavailable_format(self, video_id, format):
243         """Report extracted video URL."""
244         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
245
246     def report_rtmp_download(self):
247         """Indicate the download will use the RTMP protocol."""
248         self._downloader.to_screen(u'[youtube] RTMP download detected')
249
250     def _get_available_subtitles(self, video_id):
251         self.report_video_subtitles_download(video_id)
252         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
253         try:
254             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
255         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
256             return (u'unable to download video subtitles: %s' % compat_str(err), None)
257         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
258         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
259         if not sub_lang_list:
260             return (u'video doesn\'t have subtitles', None)
261         return sub_lang_list
262
263     def _list_available_subtitles(self, video_id):
264         sub_lang_list = self._get_available_subtitles(video_id)
265         self.report_video_subtitles_available(video_id, sub_lang_list)
266
267     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
268         """
269         Return tuple:
270         (error_message, sub_lang, sub)
271         """
272         self.report_video_subtitles_request(video_id, sub_lang, format)
273         params = compat_urllib_parse.urlencode({
274             'lang': sub_lang,
275             'name': sub_name,
276             'v': video_id,
277             'fmt': format,
278         })
279         url = 'http://www.youtube.com/api/timedtext?' + params
280         try:
281             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
282         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
283             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
284         if not sub:
285             return (u'Did not fetch video subtitles', None, None)
286         return (None, sub_lang, sub)
287
288     def _extract_subtitle(self, video_id):
289         """
290         Return a list with a tuple:
291         [(error_message, sub_lang, sub)]
292         """
293         sub_lang_list = self._get_available_subtitles(video_id)
294         sub_format = self._downloader.params.get('subtitlesformat')
295         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
296             return [(sub_lang_list[0], None, None)]
297         if self._downloader.params.get('subtitleslang', False):
298             sub_lang = self._downloader.params.get('subtitleslang')
299         elif 'en' in sub_lang_list:
300             sub_lang = 'en'
301         else:
302             sub_lang = list(sub_lang_list.keys())[0]
303         if not sub_lang in sub_lang_list:
304             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
305
306         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
307         return [subtitle]
308
309     def _extract_all_subtitles(self, video_id):
310         sub_lang_list = self._get_available_subtitles(video_id)
311         sub_format = self._downloader.params.get('subtitlesformat')
312         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
313             return [(sub_lang_list[0], None, None)]
314         subtitles = []
315         for sub_lang in sub_lang_list:
316             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
317             subtitles.append(subtitle)
318         return subtitles
319
320     def _print_formats(self, formats):
321         print('Available formats:')
322         for x in formats:
323             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
324
    def _real_initialize(self):
        """Set the site language, log in and confirm age.

        Credentials come from --username/--password or, with --netrc,
        from the 'youtube' machine entry.  Every step is best-effort:
        a failure reports a warning/error and returns early.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh tokens out of the login form.
        # NOTE(review): if either regexp fails to match, the value stays
        # None and the .encode() in login_form below would raise --
        # confirm the login page always carries both fields.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        # NOTE(review): unlike login_data above, this POST body is not
        # .encode()d to bytes -- verify behaviour on Python 3.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return
428
429     def _extract_id(self, url):
430         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
431         if mobj is None:
432             self._downloader.report_error(u'invalid URL: %s' % url)
433             return
434         video_id = mobj.group(2)
435         return video_id
436
    def _real_extract(self, url):
        """Extract real video URL(s) and metadata for a YouTube URL.

        Returns a list with one info dictionary per selected format.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Undo the JSON-style escaping (\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        # Try several 'el' values and stop at the first response that
        # carries a 'token' parameter.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to single spaces, then try the known
            # date layouts until one parses.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except -- a failed parse falls
                    # through to the next format, but any other error is
                    # silently swallowed too.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        # NOTE(review): video_token is never used below.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream dict carries a 'sig'
            # entry; a missing signature would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        # Build one result dictionary per selected (format, url) pair.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
640
641
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group 1: video id, group 2: simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Disclaimer page fetched before submitting the family-filter form.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint receiving the "I'm over 18" confirmation POST.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
649
650     def __init__(self, downloader=None):
651         InfoExtractor.__init__(self, downloader)
652
653     def report_disclaimer(self):
654         """Report disclaimer retrieval."""
655         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
656
657     def report_age_confirmation(self):
658         """Report attempt to confirm age."""
659         self._downloader.to_screen(u'[metacafe] Confirming age')
660
661     def report_download_webpage(self, video_id):
662         """Report webpage download."""
663         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
664
665     def report_extraction(self, video_id):
666         """Report information extraction."""
667         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
668
669     def _real_initialize(self):
670         # Retrieve disclaimer
671         request = compat_urllib_request.Request(self._DISCLAIMER)
672         try:
673             self.report_disclaimer()
674             disclaimer = compat_urllib_request.urlopen(request).read()
675         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
676             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
677             return
678
679         # Confirm age
680         disclaimer_form = {
681             'filters': '0',
682             'submit': "Continue - I'm over 18",
683             }
684         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
685         try:
686             self.report_age_confirmation()
687             disclaimer = compat_urllib_request.urlopen(request).read()
688         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
689             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
690             return
691
692     def _real_extract(self, url):
693         # Extract id and simplified title from URL
694         mobj = re.match(self._VALID_URL, url)
695         if mobj is None:
696             self._downloader.report_error(u'invalid URL: %s' % url)
697             return
698
699         video_id = mobj.group(1)
700
701         # Check if video comes from YouTube
702         mobj2 = re.match(r'^yt-(.*)$', video_id)
703         if mobj2 is not None:
704             self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
705             return
706
707         # Retrieve video webpage to extract further information
708         request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
709         try:
710             self.report_download_webpage(video_id)
711             webpage = compat_urllib_request.urlopen(request).read()
712         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
713             self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
714             return
715
716         # Extract URL, uploader and title from webpage
717         self.report_extraction(video_id)
718         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
719         if mobj is not None:
720             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
721             video_extension = mediaURL[-3:]
722
723             # Extract gdaKey if available
724             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
725             if mobj is None:
726                 video_url = mediaURL
727             else:
728                 gdaKey = mobj.group(1)
729                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
730         else:
731             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
732             if mobj is None:
733                 self._downloader.report_error(u'unable to extract media URL')
734                 return
735             vardict = compat_parse_qs(mobj.group(1))
736             if 'mediaData' not in vardict:
737                 self._downloader.report_error(u'unable to extract media URL')
738                 return
739             mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
740             if mobj is None:
741                 self._downloader.report_error(u'unable to extract media URL')
742                 return
743             mediaURL = mobj.group(1).replace('\\/', '/')
744             video_extension = mediaURL[-3:]
745             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
746
747         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
748         if mobj is None:
749             self._downloader.report_error(u'unable to extract title')
750             return
751         video_title = mobj.group(1).decode('utf-8')
752
753         mobj = re.search(r'submitter=(.*?);', webpage)
754         if mobj is None:
755             self._downloader.report_error(u'unable to extract uploader nickname')
756             return
757         video_uploader = mobj.group(1)
758
759         return [{
760             'id':       video_id.decode('utf-8'),
761             'url':      video_url.decode('utf-8'),
762             'uploader': video_uploader.decode('utf-8'),
763             'upload_date':  None,
764             'title':    video_title,
765             'ext':      video_extension.decode('utf-8'),
766         }]
767
768
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        message = u'[dailymotion] %s: Extracting information' % video_id
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract video information from a Dailymotion video URL."""
        # The video id is the URL path component, stripped of any
        # trailing slug ("_...") or query string ("?...").
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled via cookie.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        match = re.search(r'\s*var flashvars = (.*)', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(match.group(1))

        # Pick the best available quality key, highest first.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self._downloader.to_screen(u'[dailymotion] Using %s' % max_quality)

        match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if match is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # Unescape the JSON-escaped slashes in the URL.
        video_url = compat_urllib_parse.unquote(match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(match.group('title'))

        # Try the regular owner markup first, then the official-user markup.
        video_uploader = None
        owner = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner is not None:
            video_uploader = owner.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears on the page as DD-MM-YYYY; emit YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
856
857
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a photobucket.com URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode once here: urlopen().read() returns bytes on Python 3
            # and the regexes below operate on text.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Regex groups are already text; no .decode() needed
        # (str has no .decode() on Python 3).
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
921
922
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first resolved to the canonical watch URL
        and re-extracted (one level of recursion, guarded by new_video).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode once: urlopen().read() returns bytes on Python 3.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is the (people|profile) alternation; the uploader name
        # is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1064
1065
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https and rewrite HLS direct links
        # to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the marker is not in the page;
            # ValueError: the extracted slice is not valid JSON.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (page carries ISO date; emit YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that has at least one entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1184
1185
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None on failure.

        Errors are reported via the downloader rather than raised, so
        callers receive None when the fetch fails.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # presumably raised by the request machinery for malformed URLs
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict of captured groups.

        matchTuples is a list of (group_index, key, error_message)
        triples; each matched group is stored under *key*.  Returns None
        (after reporting) when the page does not match or a group is
        missing.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): page may be None if fetch_webpage failed, which
        # would make re.search raise TypeError — confirm intended.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # NOTE(review): uses the older trouble() API while the rest
                # of the file calls report_error() — confirm intended.
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP location of a live stream page.

        NOTE(review): the final video_url is computed but never returned,
        and the caller discards the result — this path appears unfinished;
        confirm before relying on it.
        """
        # The language code is a fixed path component of live URLs.
        video_lang = url.split('/')[-4]
        # Step 1: locate the videothek JS file referenced by the page.
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Step 2: pull the stream path, SWF player and RTMP URL out of the JS.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 XML indirection chain and return an info dict."""
        # The language code is a fixed path component of +7 URLs.
        video_lang = url.split('/')[-3]
        # Step 1: the player embed carries the URL of a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: the videoref XML points to a per-language <video> ref.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the final XML holds id, title, date and the HD stream URL.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode() assumes a bytes title (Python 2);
            # this raises AttributeError on a Python 3 str — confirm.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and Arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages (index-N.html) take the live path, which currently
        # returns nothing (see extractLiveStream); everything else is +7.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1320
1321
1322 class GenericIE(InfoExtractor):
1323     """Generic last-resort information extractor."""
1324
1325     _VALID_URL = r'.*'
1326     IE_NAME = u'generic'
1327
1328     def __init__(self, downloader=None):
1329         InfoExtractor.__init__(self, downloader)
1330
1331     def report_download_webpage(self, video_id):
1332         """Report webpage download."""
1333         if not self._downloader.params.get('test', False):
1334             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1335         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1336
1337     def report_extraction(self, video_id):
1338         """Report information extraction."""
1339         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1340
1341     def report_following_redirect(self, new_url):
1342         """Report information extraction."""
1343         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1344
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Force a HEAD request so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other redirect-ish status is treated as an error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Reissue as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same final URL means no redirect happened; let extraction proceed.
        if url == new_url:
            return False

        # Otherwise hand the resolved URL back to the downloader and stop.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True
1399
    def _real_extract(self, url):
        """Last-resort extraction: scrape the page for an embedded video URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        # id is the basename with the extension stripped off
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # NOTE(review): this message says 'title' but the failing step is
            # the uploader/domain extraction — looks like a copy-paste slip.
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1465
1466
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries (``ytsearch[N|all]:term``)."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query arrives utf-8 encoded (see _real_extract) but is
        # decoded with the locale's preferred encoding here — confirm they match.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and delegate to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so queries containing colons still work.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; page until we have enough ids
        # or the API says there are no more.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # report_error, for consistency with the rest of this module,
                # instead of the deprecated trouble() helper.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request past the total the API reports.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1545
1546
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries (``gvsearch[N|all]:term``)."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and delegate to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so queries containing colons still work.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode explicitly: on Python 3 read() returns bytes, which
                # cannot be matched against the str regexes below.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated, first-seen order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        break

            # Stop when enough results were collected or the result page has
            # no "next" link; both exits share one download loop.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum = pagenum + 1
1627
1628
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries (``yvsearch[N|all]:term``)."""

    _WORKING = False  # extractor currently marked as broken
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and delegate to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split only on the first ':' so queries containing colons still work.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode explicitly: on Python 3 read() returns bytes, which
                # cannot be matched against the str regexes below.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated, first-seen order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        break

            # Stop when enough results were collected or there is no further
            # result page; both exits share one download loop.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum = pagenum + 1
1713
1714
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: playlist/course/artist/watch URLs carrying a p=/a=/list=
    # parameter, the short /p/ form, or a bare PL/EC/UU playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData endpoint: playlist id, page size, 1-based start index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose form
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video URLs of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1): id captured from a full URL; group(2): bare playlist id
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if not 'feed' in response or not 'entry' in response['feed']:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # Keep (position, url) pairs so entries can be sorted back into
            # playlist order below; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A page shorter than _MAX_RESULTS entries is the last one
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by playlist position, then drop the positions
        videos = [v[1] for v in sorted(videos)]
        total = len(videos)

        # Apply --playlist-start / --playlist-end; playlistend is used
        # directly as the slice end (-1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        else:
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
        return
1805
1806
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of a channel for download."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []

        # Walk the paginated channel listing until the "next page" marker
        # disappears from the returned HTML.
        for pagenum in itertools.count(1):
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            req = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(req).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's video ids, keeping first-seen order and
            # deduplicating within the page only (as before).
            page_ids = []
            for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for vid in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % vid])
        return
1857
1858
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum result count per GData request
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Queue all uploads of a user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData's start-index parameter is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end; playlistend is used
        # directly as the slice end (-1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1940
1941
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Videos per AJAX page, as served by m.blip.tv
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue all videos of a blip.tv user for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id required by the AJAX endpoint is embedded in the
        # user page. Check explicitly instead of letting mobj.group(1) raise
        # an AttributeError inside a try that only catches network errors.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of this module
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # NOTE(review): membership is tested against the raw group but
                # the unescaped form is appended — confirm these never differ
                # in practice for blip.tv hrefs.
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end; playlistend is used
        # directly as the slice end (-1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
2031
2032
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Decode right away: on Python 3 read() returns bytes, which the
            # str regexes below would not match. The match groups are then
            # already text, so no further .decode() calls are needed.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's error message (raw string
                # so \s is a regex class, not a string escape)
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
2091
2092
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied (CLI options or .netrc)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials: proceed anonymously; public videos may still work.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response: urlopen().read() returns bytes on Python 3
            # and the text regex below would raise TypeError on bytes.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The SWF parameters are embedded between two fixed JavaScript snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
2188
2189
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video information, resolving /play/ redirect URLs first."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the file URL;
        # rebuild the canonical http://blip.tv/a/a-<id> form and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask the site for a JSON description of the video.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # On Python 3 the URL (and thus the title) is already text;
                # only Python 2 byte strings need decoding. The unconditional
                # .decode('UTF-8') crashed with AttributeError on Python 3.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2290
2291
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this previously read self._download.report_error, which
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server path; the video itself
        # lives at <path>/<id>.flv on the same server.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2340
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate codes, ordered lowest to highest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate code (used by --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Pixel dimensions per bitrate code (used by --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE whitespace.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-part media configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode's MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print available format codes with their extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip; returns a list of info dicts."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortname forms (":tds", ":colbert", ...) map to the newest episode
        # of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode: download the newest one (server redirects
            # the bare full-episodes URL to the latest episode page).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # The player embed carries an mgid: URI identifying the episode.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Fetch the MRSS index that lists the episode's parts (<item> elements).
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per video part; each part gets its own info dict.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # The per-part config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into a plain HTTP URL on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2535
2536
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL for an Escapist episode page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = mobj.group('showname')
        episode_id = mobj.group('episode')

        self.report_extraction(show_name)
        try:
            # Decode the page using the charset declared in the response
            # headers, defaulting to UTF-8 when none is declared.
            page_handle = compat_urllib_request.urlopen(url)
            raw_page = page_handle.read()
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in standard <meta> tags on the episode page.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        thumb_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL embeds the (percent-encoded) config URL.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual video stream.
        video_url = config['playlist'][1]['url']

        return [{
            'id': episode_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': thumb_url,
            'description': description,
            'player_url': player_url,
        }]
2610
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the site extraction is redone.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract metadata from the moogaloop XML, then build an f4f fragment
        URL from the video's Adobe f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partial result; filled in incrementally from the two XML documents.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore parameter is required for the manifest server to respond.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements are namespaced with the Adobe f4m 1.0 namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Compose the URL of the first segment/fragment on the manifest host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2681
2682
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded inside a flashvars-style parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, stripped of the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the entire matched URL is used, not a capture group.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2740
2741
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track page URL through the public SoundCloud API to get
    the track metadata, then fetches the per-track stream definitions and
    selects the 128 kbps MP3 HTTP stream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader name and the track slug are both part of the URL path.
        uploader, slug_title = mobj.group(1), mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track URL into JSON track metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream definitions for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2814
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that a track's stream info is being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set URL via the API and return one info dict per track."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Use report_error like SoundcloudIE; the deprecated
            # trouble(u'ERROR: ...') calls duplicated the error prefix.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # The resolve endpoint reports failures as a list of error objects.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2895
2896
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real stream id is base64-encoded in the page
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: a file name containing extra dots made
        # the original unbounded split raise ValueError (too many values).
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2950
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        'best' (or an unknown bitrate) selects the highest bitrate; if the
        format entry carries no per-bitrate mapping it is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass  # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match on a str already yields str groups; the old
        # .decode('utf-8') calls raised AttributeError under Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode explicitly: urlopen().read() returns bytes)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Initialize so an empty format list cannot leave these unbound.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            # Guard: every candidate url was dead; the old code would have
            # crashed on file_url.split() below.
            self._downloader.report_error(u'unable to find a working stream url')
            return

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3065
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Dispatches on the named groups of _VALID_URL: a specific video
    (course + video), a course page (course only), or the site root
    (neither). The two playlist branches recurse via self.extract.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata (title, file name) is served as an XML file
            # alongside the videos themselves.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each video link is re-dispatched through self.extract, which
            # lands in the "specific video" branch above.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each course page link recurses into the "course page" branch above.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3177
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer are in <meta> tags. re.search on a str
        # already returns str groups; the old .decode('iso-8859-1') calls
        # raised AttributeError under Python 3.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read "unable to mtvn_uri" (missing verb)
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error for consistency with the rest of the file
            # (trouble() is the deprecated spelling).
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3257
3258
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in segments; one info dict is returned per segment.
    The segment file ids are obfuscated with a seeded charset shuffle
    (see _get_file_ID_mix_string / _get_file_id).
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id from the current millisecond timestamp plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the charset using *seed*.

        LCG-style scramble: each step advances the seed and moves one
        character from `source` to `mixed`. The exact arithmetic and the
        removal order must stay as-is — presumably this mirrors the site's
        player code, so do not "simplify" these statements.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id,
        using the seed-shuffled charset as the lookup table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # JSON playlist endpoint with title, seed, stream ids and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format to a Youku stream name + extension:
            # best -> hd2 if offered (flv ext), worst -> mp4, anything else -> flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment index (2-digit hex) into the decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3368
3369
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page source.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # Pull the stream url, title and thumbnail out of the page source.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3432
3433
3434 class GooglePlusIE(InfoExtractor):
3435     """Information extractor for plus.google.com."""
3436
3437     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3438     IE_NAME = u'plus.google'
3439
3440     def __init__(self, downloader=None):
3441         InfoExtractor.__init__(self, downloader)
3442
3443     def report_extract_entry(self, url):
3444         """Report downloading extry"""
3445         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3446
3447     def report_date(self, upload_date):
3448         """Report downloading extry"""
3449         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3450
3451     def report_uploader(self, uploader):
3452         """Report downloading extry"""
3453         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3454
3455     def report_title(self, video_title):
3456         """Report downloading extry"""
3457         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3458
3459     def report_extract_vid_page(self, video_page):
3460         """Report information extraction."""
3461         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3462
3463     def _real_extract(self, url):
3464         # Extract id from URL
3465         mobj = re.match(self._VALID_URL, url)
3466         if mobj is None:
3467             self._downloader.report_error(u'Invalid URL: %s' % url)
3468             return
3469
3470         post_url = mobj.group(0)
3471         video_id = mobj.group(1)
3472
3473         video_extension = 'flv'
3474
3475         # Step 1, Retrieve post webpage to extract further information
3476         self.report_extract_entry(post_url)
3477         request = compat_urllib_request.Request(post_url)
3478         try:
3479             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3480         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3481             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3482             return
3483
3484         # Extract update date
3485         upload_date = None
3486         pattern = 'title="Timestamp">(.*?)</a>'
3487         mobj = re.search(pattern, webpage)
3488         if mobj:
3489             upload_date = mobj.group(1)
3490             # Convert timestring to a format suitable for filename
3491             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3492             upload_date = upload_date.strftime('%Y%m%d')
3493         self.report_date(upload_date)
3494
3495         # Extract uploader
3496         uploader = None
3497         pattern = r'rel\="author".*?>(.*?)</a>'
3498         mobj = re.search(pattern, webpage)
3499         if mobj:
3500             uploader = mobj.group(1)
3501         self.report_uploader(uploader)
3502
3503         # Extract title
3504         # Get the first line for title
3505         video_title = u'NA'
3506         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3507         mobj = re.search(pattern, webpage)
3508         if mobj:
3509             video_title = mobj.group(1)
3510         self.report_title(video_title)
3511
3512         # Step 2, Stimulate clicking the image box to launch video
3513         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3514         mobj = re.search(pattern, webpage)
3515         if mobj is None:
3516             self._downloader.report_error(u'unable to extract video page URL')
3517
3518         video_page = mobj.group(1)
3519         request = compat_urllib_request.Request(video_page)
3520         try:
3521             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3522         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3523             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3524             return
3525         self.report_extract_vid_page(video_page)
3526
3527
3528         # Extract video links on video page
3529         """Extract video links of all sizes"""
3530         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3531         mobj = re.findall(pattern, webpage)
3532         if len(mobj) == 0:
3533             self._downloader.report_error(u'unable to extract video links')
3534
3535         # Sort in resolution
3536         links = sorted(mobj)
3537
3538         # Choose the lowest of the sort, i.e. highest resolution
3539         video_url = links[-1]
3540         # Only get the url. The resolution part in the tuple has no use anymore
3541         video_url = video_url[-1]
3542         # Treat escaped \u0026 style hex
3543         try:
3544             video_url = video_url.decode("unicode_escape")
3545         except AttributeError: # Python 3
3546             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3547
3548
3549         return [{
3550             'id':       video_id,
3551             'url':      video_url,
3552             'uploader': uploader,
3553             'upload_date':  upload_date,
3554             'title':    video_title,
3555             'ext':      video_extension,
3556         }]
3557
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Search the page for rexp; return the unescaped first group,
            # or default when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date', so the upload date
            # never reached the downloader.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3593
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page; return (count of items, list of *valid* items)."""
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # BUGFIX: returning None here made the caller's tuple unpacking
            # crash with a TypeError; report an empty page instead.
            return (0, [])

        response = json.loads(webpage)
        # On success the API returns a list; on failure, a dict with 'error'.
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            # BUGFIX: same as above — never return None to the unpacking caller.
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archives
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3680
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUGFIX: previously execution continued after reporting the error
            # and crashed on m.group('url') with an AttributeError.
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # BUGFIX: use report_error like the rest of the file instead of
            # the deprecated trouble(), and stop instead of crashing below.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = clean_html(m.group('title'))

        # The description is optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3717
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the default suitable() would
        # not match it correctly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # The movie entries, their titles and their thumbnails appear in the
        # same order on the page, so the three iterators are zipped together.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # BUGFIX: previously a broken info dict (empty url) was still
                # appended after reporting the error; skip the entry instead.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return videos
3758
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The video file lives on the CDN under its numeric id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader_match = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = uploader_match.group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3780
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the container format from the URL itself
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # BUGFIX: use report_error like the rest of the extractors instead
            # of the deprecated trouble() (report_error adds the ERROR prefix
            # itself, so the hand-written one is dropped).
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fall back to a generated title when the page has none
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3836
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in an inline script
        match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot in the URL path
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3871
3872
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Setting this cookie skips the age-verification interstitial
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 looks like '480p_370k_8004515': size, bitrate, id
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the check used the stale 'result' match object left over
            # from the download-list search above, so a missing format was
            # never detected; test the looked-up format instead.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3989
3990
3991
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is part of the URL itself
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the error message wrongly claimed the video *title*
            # could not be extracted; it is the upload date that failed here.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4033
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the page the URL points at
        webpage = self._download_webpage(url, video_id)

        # The title comes from the <title> element
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page, which holds the actual stream URL
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL through addVariable()
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4079
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JSON assignment in the page
        match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(match.group(1))

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        # Walk the play/next API until the mix reports its last track
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4123
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Video and thumbnail URLs are derived directly from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4147
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL = r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the flag must be passed here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch between a single talk and a whole playlist
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
        return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk entries and their titles appear in the same order on the page
        matched_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        matched_names = re.finditer(video_name_RE, webpage)
        results = []
        for video_match, name_match in zip(matched_videos, matched_names):
            video_id = video_match.group('video_id')
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            results.append(self._talk_info(talk_url, video_id))
        return results

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = match.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
4220
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos.

    Takes the video id from the last (or second-to-last, if the URL has a
    trailing slash) path element and reads all metadata from the site's XML
    metadata endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this previously read `format = ext`, but no name `ext`
            # exists in this scope (the variable is `extension`), so any video
            # without a <format_id> element raised NameError.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4276
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The actual media information lives in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element describes the best/latest format variant.
        best_variant = idoc[-1]
        filename = best_variant.findall('./filename')[0].text
        duration = float(best_variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4309
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Bug fix: previously this only called trouble() and then fell
            # through to m.group() on None, crashing with AttributeError.
            # Raise instead, consistent with SpiegelIE's error handling.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional; leave them as None if absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4358
4359
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        GenericIE,
    ]
    # Instantiate in declaration order; matching is first-come-first-served.
    return [klass() for klass in extractor_classes]