git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             note = u'Downloading video webpage'
 118         if note is not False:
 119             self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
 120         try:
 121             return compat_urllib_request.urlopen(url_or_request)
 122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 123             if errnote is None:
 124                 errnote = u'Unable to download webpage'
 125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 126
 127     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 128         """ Returns the data of the page as a string """
 129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 130         content_type = urlh.headers.get('Content-Type', '')
 131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 132         if m:
 133             encoding = m.group(1)
 134         else:
 135             encoding = 'utf-8'
 136         webpage_bytes = urlh.read()
 137         if self._downloader.params.get('dump_intermediate_pages', False):
 138             try:
 139                 url = url_or_request.get_full_url()
 140             except AttributeError:
 141                 url = url_or_request
 142             self._downloader.to_screen(u'Dumping request to ' + url)
 143             dump = base64.b64encode(webpage_bytes).decode('ascii')
 144             self._downloader.to_screen(dump)
 145         return webpage_bytes.decode(encoding, 'replace')
 146
 147     #Methods for following #608
 148     #They set the correct value of the '_type' key
 149     def video_result(self, video_info):
 150         """Returns a video"""
 151         video_info['_type'] = 'video'
 152         return video_info
 153     def url_result(self, url, ie=None):
 154         """Returns a url that points to a page that should be processed"""
 155         #TODO: ie should be the class used for getting the info
 156         video_info = {'_type': 'url',
 157                       'url': url,
 158                       'ie_key': ie}
 159         return video_info
 160     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 161         """Returns a playlist"""
 162         video_info = {'_type': 'playlist',
 163                       'entries': entries}
 164         if playlist_id:
 165             video_info['id'] = playlist_id
 166         if playlist_title:
 167             video_info['title'] = playlist_title
 168         return video_info
 169
 170
 171 class YoutubeIE(InfoExtractor):
 172     """Information extractor for youtube.com."""
 173
 174     _VALID_URL = r"""^
 175                      (
 176                          (?:https?://)?                                       # http(s):// (optional)
 177                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 178                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 179                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 180                          (?:                                                  # the various things that can precede the ID:
 181                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 182                              |(?:                                             # or the v= param in all its forms
 183                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 184                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 185                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 186                                  v=
 187                              )
 188                          )?                                                   # optional -> youtube.com/xxxx is OK
 189                      )?                                                       # all until now is optional -> you can pass the naked ID
 190                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 191                      (?(1).+)?                                                # if we found the ID, everything can follow
 192                      $"""
 193     _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 194     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 195     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 196     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 197     _NETRC_MACHINE = 'youtube'
 198     # Listed in order of quality
 199     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 200     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 201     _video_extensions = {
 202         '13': '3gp',
 203         '17': 'mp4',
 204         '18': 'mp4',
 205         '22': 'mp4',
 206         '37': 'mp4',
 207         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 208         '43': 'webm',
 209         '44': 'webm',
 210         '45': 'webm',
 211         '46': 'webm',
 212     }
 213     _video_dimensions = {
 214         '5': '240x400',
 215         '6': '???',
 216         '13': '???',
 217         '17': '144x176',
 218         '18': '360x640',
 219         '22': '720x1280',
 220         '34': '360x640',
 221         '35': '480x854',
 222         '37': '1080x1920',
 223         '38': '3072x4096',
 224         '43': '360x640',
 225         '44': '480x854',
 226         '45': '720x1280',
 227         '46': '1080x1920',
 228     }
 229     IE_NAME = u'youtube'
 230
 231     @classmethod
 232     def suitable(cls, url):
 233         """Receives a URL and returns True if suitable for this IE."""
 234         if YoutubePlaylistIE.suitable(url): return False
 235         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 236
 237     def report_lang(self):
 238         """Report attempt to set language."""
 239         self._downloader.to_screen(u'[youtube] Setting language')
 240
 241     def report_login(self):
 242         """Report attempt to log in."""
 243         self._downloader.to_screen(u'[youtube] Logging in')
 244
 245     def report_age_confirmation(self):
 246         """Report attempt to confirm age."""
 247         self._downloader.to_screen(u'[youtube] Confirming age')
 248
 249     def report_video_webpage_download(self, video_id):
 250         """Report attempt to download video webpage."""
 251         self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 252
 253     def report_video_info_webpage_download(self, video_id):
 254         """Report attempt to download video info webpage."""
 255         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 256
 257     def report_video_subtitles_download(self, video_id):
 258         """Report attempt to download video info webpage."""
 259         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
 260
 261     def report_video_subtitles_request(self, video_id, sub_lang, format):
 262         """Report attempt to download video info webpage."""
 263         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 264
 265     def report_video_subtitles_available(self, video_id, sub_lang_list):
 266         """Report available subtitles."""
 267         sub_lang = ",".join(list(sub_lang_list.keys()))
 268         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
 269
 270     def report_information_extraction(self, video_id):
 271         """Report attempt to extract video information."""
 272         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 273
 274     def report_unavailable_format(self, video_id, format):
 275         """Report extracted video URL."""
 276         self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 277
 278     def report_rtmp_download(self):
 279         """Indicate the download will use the RTMP protocol."""
 280         self._downloader.to_screen(u'[youtube] RTMP download detected')
 281
 282     def _get_available_subtitles(self, video_id):
 283         self.report_video_subtitles_download(video_id)
 284         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 285         try:
 286             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 287         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 288             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 289         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 290         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 291         if not sub_lang_list:
 292             return (u'video doesn\'t have subtitles', None)
 293         return sub_lang_list
 294
 295     def _list_available_subtitles(self, video_id):
 296         sub_lang_list = self._get_available_subtitles(video_id)
 297         self.report_video_subtitles_available(video_id, sub_lang_list)
 298
 299     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 300         """
 301         Return tuple:
 302         (error_message, sub_lang, sub)
 303         """
 304         self.report_video_subtitles_request(video_id, sub_lang, format)
 305         params = compat_urllib_parse.urlencode({
 306             'lang': sub_lang,
 307             'name': sub_name,
 308             'v': video_id,
 309             'fmt': format,
 310         })
 311         url = 'http://www.youtube.com/api/timedtext?' + params
 312         try:
 313             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 314         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 315             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 316         if not sub:
 317             return (u'Did not fetch video subtitles', None, None)
 318         return (None, sub_lang, sub)
 319
 320     def _extract_subtitle(self, video_id):
 321         """
 322         Return a list with a tuple:
 323         [(error_message, sub_lang, sub)]
 324         """
 325         sub_lang_list = self._get_available_subtitles(video_id)
 326         sub_format = self._downloader.params.get('subtitlesformat')
 327         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 328             return [(sub_lang_list[0], None, None)]
 329         if self._downloader.params.get('subtitleslang', False):
 330             sub_lang = self._downloader.params.get('subtitleslang')
 331         elif 'en' in sub_lang_list:
 332             sub_lang = 'en'
 333         else:
 334             sub_lang = list(sub_lang_list.keys())[0]
 335         if not sub_lang in sub_lang_list:
 336             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 337
 338         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 339         return [subtitle]
 340
 341     def _extract_all_subtitles(self, video_id):
 342         sub_lang_list = self._get_available_subtitles(video_id)
 343         sub_format = self._downloader.params.get('subtitlesformat')
 344         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 345             return [(sub_lang_list[0], None, None)]
 346         subtitles = []
 347         for sub_lang in sub_lang_list:
 348             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 349             subtitles.append(subtitle)
 350         return subtitles
 351
 352     def _print_formats(self, formats):
 353         print('Available formats:')
 354         for x in formats:
 355             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 356
 357     def _real_initialize(self):
 358         if self._downloader is None:
 359             return
 360
 361         username = None
 362         password = None
 363         downloader_params = self._downloader.params
 364
 365         # Attempt to use provided username and password or .netrc data
 366         if downloader_params.get('username', None) is not None:
 367             username = downloader_params['username']
 368             password = downloader_params['password']
 369         elif downloader_params.get('usenetrc', False):
 370             try:
 371                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 372                 if info is not None:
 373                     username = info[0]
 374                     password = info[2]
 375                 else:
 376                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 377             except (IOError, netrc.NetrcParseError) as err:
 378                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 379                 return
 380
 381         # Set language
 382         request = compat_urllib_request.Request(self._LANG_URL)
 383         try:
 384             self.report_lang()
 385             compat_urllib_request.urlopen(request).read()
 386         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 387             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 388             return
 389
 390         # No authentication to be performed
 391         if username is None:
 392             return
 393
 394         request = compat_urllib_request.Request(self._LOGIN_URL)
 395         try:
 396             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 397         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 398             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 399             return
 400
 401         galx = None
 402         dsh = None
 403         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 404         if match:
 405           galx = match.group(1)
 406
 407         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 408         if match:
 409           dsh = match.group(1)
 410
 411         # Log in
 412         login_form_strs = {
 413                 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 414                 u'Email': username,
 415                 u'GALX': galx,
 416                 u'Passwd': password,
 417                 u'PersistentCookie': u'yes',
 418                 u'_utf8': u'霱',
 419                 u'bgresponse': u'js_disabled',
 420                 u'checkConnection': u'',
 421                 u'checkedDomains': u'youtube',
 422                 u'dnConn': u'',
 423                 u'dsh': dsh,
 424                 u'pstMsg': u'0',
 425                 u'rmShown': u'1',
 426                 u'secTok': u'',
 427                 u'signIn': u'Sign in',
 428                 u'timeStmp': u'',
 429                 u'service': u'youtube',
 430                 u'uilel': u'3',
 431                 u'hl': u'en_US',
 432         }
 433         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 434         # chokes on unicode
 435         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 436         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 437         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 438         try:
 439             self.report_login()
 440             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 441             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 442                 self._downloader.report_warning(u'unable to log in: bad username or password')
 443                 return
 444         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 445             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 446             return
 447
 448         # Confirm age
 449         age_form = {
 450                 'next_url':     '/',
 451                 'action_confirm':   'Confirm',
 452                 }
 453         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 454         try:
 455             self.report_age_confirmation()
 456             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 457         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 458             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 459             return
 460
 461     def _extract_id(self, url):
 462         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 463         if mobj is None:
 464             self._downloader.report_error(u'invalid URL: %s' % url)
 465             return
 466         video_id = mobj.group(2)
 467         return video_id
 468
 469     def _real_extract(self, url):
 470         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 471         mobj = re.search(self._NEXT_URL_RE, url)
 472         if mobj:
 473             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 474         video_id = self._extract_id(url)
 475
 476         # Get video webpage
 477         self.report_video_webpage_download(video_id)
 478         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 479         request = compat_urllib_request.Request(url)
 480         try:
 481             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 482         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 483             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
 484             return
 485
 486         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 487
 488         # Attempt to extract SWF player URL
 489         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 490         if mobj is not None:
 491             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 492         else:
 493             player_url = None
 494
 495         # Get video info
 496         self.report_video_info_webpage_download(video_id)
 497         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 498             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 499                     % (video_id, el_type))
 500             video_info_webpage = self._download_webpage(video_info_url, video_id,
 501                                     note=False,
 502                                     errnote='unable to download video info webpage')
 503             video_info = compat_parse_qs(video_info_webpage)
 504             if 'token' in video_info:
 505                 break
 506         if 'token' not in video_info:
 507             if 'reason' in video_info:
 508                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
 509             else:
 510                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
 511             return
 512
 513         # Check for "rental" videos
 514         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 515             self._downloader.report_error(u'"rental" videos not supported')
 516             return
 517
 518         # Start extracting information
 519         self.report_information_extraction(video_id)
 520
 521         # uploader
 522         if 'author' not in video_info:
 523             self._downloader.report_error(u'unable to extract uploader name')
 524             return
 525         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 526
 527         # uploader_id
 528         video_uploader_id = None
 529         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 530         if mobj is not None:
 531             video_uploader_id = mobj.group(1)
 532         else:
 533             self._downloader.report_warning(u'unable to extract uploader nickname')
 534
 535         # title
 536         if 'title' not in video_info:
 537             self._downloader.report_error(u'unable to extract video title')
 538             return
 539         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 540
 541         # thumbnail image
 542         if 'thumbnail_url' not in video_info:
 543             self._downloader.report_warning(u'unable to extract video thumbnail')
 544             video_thumbnail = ''
 545         else:   # don't panic if we can't find it
 546             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 547
 548         # upload date
 549         upload_date = None
 550         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 551         if mobj is not None:
 552             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 553             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 554             for expression in format_expressions:
 555                 try:
 556                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 557                 except:
 558                     pass
 559
 560         # description
 561         video_description = get_element_by_id("eow-description", video_webpage)
 562         if video_description:
 563             video_description = clean_html(video_description)
 564         else:
 565             video_description = ''
 566
 567         # subtitles
 568         video_subtitles = None
 569
 570         if self._downloader.params.get('writesubtitles', False):
 571             video_subtitles = self._extract_subtitle(video_id)
 572             if video_subtitles:
 573                 (sub_error, sub_lang, sub) = video_subtitles[0]
 574                 if sub_error:
 575                     self._downloader.report_error(sub_error)
 576
 577         if self._downloader.params.get('allsubtitles', False):
 578             video_subtitles = self._extract_all_subtitles(video_id)
 579             for video_subtitle in video_subtitles:
 580                 (sub_error, sub_lang, sub) = video_subtitle
 581                 if sub_error:
 582                     self._downloader.report_error(sub_error)
 583
 584         if self._downloader.params.get('listsubtitles', False):
 585             sub_lang_list = self._list_available_subtitles(video_id)
 586             return
 587
 588         if 'length_seconds' not in video_info:
 589             self._downloader.report_warning(u'unable to extract video duration')
 590             video_duration = ''
 591         else:
 592             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 593
 594         # token
 595         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 596
 597         # Decide which formats to download
 598         req_format = self._downloader.params.get('format', None)
 599
 600         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 601             self.report_rtmp_download()
 602             video_url_list = [(None, video_info['conn'][0])]
 603         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 604             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 605             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
 606             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
 607             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 608
 609             format_limit = self._downloader.params.get('format_limit', None)
 610             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 611             if format_limit is not None and format_limit in available_formats:
 612                 format_list = available_formats[available_formats.index(format_limit):]
 613             else:
 614                 format_list = available_formats
 615             existing_formats = [x for x in format_list if x in url_map]
 616             if len(existing_formats) == 0:
 617                 self._downloader.report_error(u'no known formats available for video')
 618                 return
 619             if self._downloader.params.get('listformats', None):
 620                 self._print_formats(existing_formats)
 621                 return
 622             if req_format is None or req_format == 'best':
 623                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 624             elif req_format == 'worst':
 625                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 626             elif req_format in ('-1', 'all'):
 627                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 628             else:
 629                 # Specific formats. We pick the first in a slash-delimeted sequence.
 630                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 631                 req_formats = req_format.split('/')
 632                 video_url_list = None
 633                 for rf in req_formats:
 634                     if rf in url_map:
 635                         video_url_list = [(rf, url_map[rf])]
 636                         break
 637                 if video_url_list is None:
 638                     self._downloader.report_error(u'requested format not available')
 639                     return
 640         else:
 641             self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
 642             return
 643
 644         results = []
 645         for format_param, video_real_url in video_url_list:
 646             # Extension
 647             video_extension = self._video_extensions.get(format_param, 'flv')
 648
 649             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 650                                               self._video_dimensions.get(format_param, '???'))
 651
 652             results.append({
 653                 'id':       video_id,
 654                 'url':      video_real_url,
 655                 'uploader': video_uploader,
 656                 'uploader_id': video_uploader_id,
 657                 'upload_date':  upload_date,
 658                 'title':    video_title,
 659                 'ext':      video_extension,
 660                 'format':   video_format,
 661                 'thumbnail':    video_thumbnail,
 662                 'description':  video_description,
 663                 'player_url':   player_url,
 664                 'subtitles':    video_subtitles,
 665                 'duration':     video_duration
 666             })
 667         return results
 668
 669
 670 class MetacafeIE(InfoExtractor):
 671     """Information Extractor for metacafe.com."""
 672
 673     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 674     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 675     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 676     IE_NAME = u'metacafe'
 677
 678     def __init__(self, downloader=None):
 679         InfoExtractor.__init__(self, downloader)
 680
 681     def report_disclaimer(self):
 682         """Report disclaimer retrieval."""
 683         self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 684
 685     def report_age_confirmation(self):
 686         """Report attempt to confirm age."""
 687         self._downloader.to_screen(u'[metacafe] Confirming age')
 688
 689     def report_download_webpage(self, video_id):
 690         """Report webpage download."""
 691         self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 692
 693     def report_extraction(self, video_id):
 694         """Report information extraction."""
 695         self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 696
 697     def _real_initialize(self):
 698         # Retrieve disclaimer
 699         request = compat_urllib_request.Request(self._DISCLAIMER)
 700         try:
 701             self.report_disclaimer()
 702             disclaimer = compat_urllib_request.urlopen(request).read()
 703         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 704             self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
 705             return
 706
 707         # Confirm age
 708         disclaimer_form = {
 709             'filters': '0',
 710             'submit': "Continue - I'm over 18",
 711             }
 712         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 713         try:
 714             self.report_age_confirmation()
 715             disclaimer = compat_urllib_request.urlopen(request).read()
 716         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 717             self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
 718             return
 719
 720     def _real_extract(self, url):
 721         # Extract id and simplified title from URL
 722         mobj = re.match(self._VALID_URL, url)
 723         if mobj is None:
 724             self._downloader.report_error(u'invalid URL: %s' % url)
 725             return
 726
 727         video_id = mobj.group(1)
 728
 729         # Check if video comes from YouTube
 730         mobj2 = re.match(r'^yt-(.*)$', video_id)
 731         if mobj2 is not None:
 732             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 733
 734         # Retrieve video webpage to extract further information
 735         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 736
 737         # Extract URL, uploader and title from webpage
 738         self.report_extraction(video_id)
 739         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 740         if mobj is not None:
 741             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 742             video_extension = mediaURL[-3:]
 743
 744             # Extract gdaKey if available
 745             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 746             if mobj is None:
 747                 video_url = mediaURL
 748             else:
 749                 gdaKey = mobj.group(1)
 750                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 751         else:
 752             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 753             if mobj is None:
 754                 self._downloader.report_error(u'unable to extract media URL')
 755                 return
 756             vardict = compat_parse_qs(mobj.group(1))
 757             if 'mediaData' not in vardict:
 758                 self._downloader.report_error(u'unable to extract media URL')
 759                 return
 760             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 761             if mobj is None:
 762                 self._downloader.report_error(u'unable to extract media URL')
 763                 return
 764             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 765             video_extension = mediaURL[-3:]
 766             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 767
 768         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 769         if mobj is None:
 770             self._downloader.report_error(u'unable to extract title')
 771             return
 772         video_title = mobj.group(1).decode('utf-8')
 773
 774         mobj = re.search(r'submitter=(.*?);', webpage)
 775         if mobj is None:
 776             self._downloader.report_error(u'unable to extract uploader nickname')
 777             return
 778         video_uploader = mobj.group(1)
 779
 780         return [{
 781             'id':       video_id.decode('utf-8'),
 782             'url':      video_url.decode('utf-8'),
 783             'uploader': video_uploader.decode('utf-8'),
 784             'upload_date':  None,
 785             'title':    video_title,
 786             'ext':      video_extension.decode('utf-8'),
 787         }]
 788
 789
 790 class DailymotionIE(InfoExtractor):
 791     """Information Extractor for Dailymotion"""
 792
 793     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 794     IE_NAME = u'dailymotion'
 795     _WORKING = False
 796
 797     def __init__(self, downloader=None):
 798         InfoExtractor.__init__(self, downloader)
 799
 800     def report_extraction(self, video_id):
 801         """Report information extraction."""
 802         self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 803
 804     def _real_extract(self, url):
 805         # Extract id and simplified title from URL
 806         mobj = re.match(self._VALID_URL, url)
 807         if mobj is None:
 808             self._downloader.report_error(u'invalid URL: %s' % url)
 809             return
 810
 811         video_id = mobj.group(1).split('_')[0].split('?')[0]
 812
 813         video_extension = 'mp4'
 814
 815         # Retrieve video webpage to extract further information
 816         request = compat_urllib_request.Request(url)
 817         request.add_header('Cookie', 'family_filter=off')
 818         webpage = self._download_webpage(request, video_id)
 819
 820         # Extract URL, uploader and title from webpage
 821         self.report_extraction(video_id)
 822         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 823         if mobj is None:
 824             self._downloader.report_error(u'unable to extract media URL')
 825             return
 826         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 827
 828         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 829             if key in flashvars:
 830                 max_quality = key
 831                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 832                 break
 833         else:
 834             self._downloader.report_error(u'unable to extract video URL')
 835             return
 836
 837         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 838         if mobj is None:
 839             self._downloader.report_error(u'unable to extract video URL')
 840             return
 841
 842         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 843
 844         # TODO: support choosing qualities
 845
 846         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 847         if mobj is None:
 848             self._downloader.report_error(u'unable to extract title')
 849             return
 850         video_title = unescapeHTML(mobj.group('title'))
 851
 852         video_uploader = None
 853         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 854         if mobj is None:
 855             # lookin for official user
 856             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 857             if mobj_official is None:
 858                 self._downloader.report_warning(u'unable to extract uploader nickname')
 859             else:
 860                 video_uploader = mobj_official.group(1)
 861         else:
 862             video_uploader = mobj.group(1)
 863
 864         video_upload_date = None
 865         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 866         if mobj is not None:
 867             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 868
 869         return [{
 870             'id':       video_id,
 871             'url':      video_url,
 872             'uploader': video_uploader,
 873             'upload_date':  video_upload_date,
 874             'title':    video_title,
 875             'ext':      video_extension,
 876         }]
 877
 878
 879 class PhotobucketIE(InfoExtractor):
 880     """Information extractor for photobucket.com."""
 881
 882     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 883     IE_NAME = u'photobucket'
 884
 885     def __init__(self, downloader=None):
 886         InfoExtractor.__init__(self, downloader)
 887
 888     def report_download_webpage(self, video_id):
 889         """Report webpage download."""
 890         self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 891
 892     def report_extraction(self, video_id):
 893         """Report information extraction."""
 894         self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 895
 896     def _real_extract(self, url):
 897         # Extract id from URL
 898         mobj = re.match(self._VALID_URL, url)
 899         if mobj is None:
 900             self._downloader.report_error(u'Invalid URL: %s' % url)
 901             return
 902
 903         video_id = mobj.group(1)
 904
 905         video_extension = 'flv'
 906
 907         # Retrieve video webpage to extract further information
 908         request = compat_urllib_request.Request(url)
 909         try:
 910             self.report_download_webpage(video_id)
 911             webpage = compat_urllib_request.urlopen(request).read()
 912         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 913             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 914             return
 915
 916         # Extract URL, uploader, and title from webpage
 917         self.report_extraction(video_id)
 918         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 919         if mobj is None:
 920             self._downloader.report_error(u'unable to extract media URL')
 921             return
 922         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 923
 924         video_url = mediaURL
 925
 926         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 927         if mobj is None:
 928             self._downloader.report_error(u'unable to extract title')
 929             return
 930         video_title = mobj.group(1).decode('utf-8')
 931
 932         video_uploader = mobj.group(2).decode('utf-8')
 933
 934         return [{
 935             'id':       video_id.decode('utf-8'),
 936             'url':      video_url.decode('utf-8'),
 937             'uploader': video_uploader,
 938             'upload_date':  None,
 939             'title':    video_title,
 940             'ext':      video_extension.decode('utf-8'),
 941         }]
 942
 943
 944 class YahooIE(InfoExtractor):
 945     """Information extractor for video.yahoo.com."""
 946
 947     _WORKING = False
 948     # _VALID_URL matches all Yahoo! Video URLs
 949     # _VPAGE_URL matches only the extractable '/watch/' URLs
 950     _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 951     _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 952     IE_NAME = u'video.yahoo'
 953
 954     def __init__(self, downloader=None):
 955         InfoExtractor.__init__(self, downloader)
 956
 957     def report_download_webpage(self, video_id):
 958         """Report webpage download."""
 959         self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 960
 961     def report_extraction(self, video_id):
 962         """Report information extraction."""
 963         self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 964
 965     def _real_extract(self, url, new_video=True):
 966         # Extract ID from URL
 967         mobj = re.match(self._VALID_URL, url)
 968         if mobj is None:
 969             self._downloader.report_error(u'Invalid URL: %s' % url)
 970             return
 971
 972         video_id = mobj.group(2)
 973         video_extension = 'flv'
 974
 975         # Rewrite valid but non-extractable URLs as
 976         # extractable English language /watch/ URLs
 977         if re.match(self._VPAGE_URL, url) is None:
 978             request = compat_urllib_request.Request(url)
 979             try:
 980                 webpage = compat_urllib_request.urlopen(request).read()
 981             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 982                 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
 983                 return
 984
 985             mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 986             if mobj is None:
 987                 self._downloader.report_error(u'Unable to extract id field')
 988                 return
 989             yahoo_id = mobj.group(1)
 990
 991             mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 992             if mobj is None:
 993                 self._downloader.report_error(u'Unable to extract vid field')
 994                 return
 995             yahoo_vid = mobj.group(1)
 996
 997             url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 998             return self._real_extract(url, new_video=False)
 999
1000         # Retrieve video webpage to extract further information
1001         request = compat_urllib_request.Request(url)
1002         try:
1003             self.report_download_webpage(video_id)
1004             webpage = compat_urllib_request.urlopen(request).read()
1005         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1007             return
1008
1009         # Extract uploader and title from webpage
1010         self.report_extraction(video_id)
1011         mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1012         if mobj is None:
1013             self._downloader.report_error(u'unable to extract video title')
1014             return
1015         video_title = mobj.group(1).decode('utf-8')
1016
1017         mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1018         if mobj is None:
1019             self._downloader.report_error(u'unable to extract video uploader')
1020             return
1021         video_uploader = mobj.group(1).decode('utf-8')
1022
1023         # Extract video thumbnail
1024         mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1025         if mobj is None:
1026             self._downloader.report_error(u'unable to extract video thumbnail')
1027             return
1028         video_thumbnail = mobj.group(1).decode('utf-8')
1029
1030         # Extract video description
1031         mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1032         if mobj is None:
1033             self._downloader.report_error(u'unable to extract video description')
1034             return
1035         video_description = mobj.group(1).decode('utf-8')
1036         if not video_description:
1037             video_description = 'No description available.'
1038
1039         # Extract video height and width
1040         mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1041         if mobj is None:
1042             self._downloader.report_error(u'unable to extract video height')
1043             return
1044         yv_video_height = mobj.group(1)
1045
1046         mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1047         if mobj is None:
1048             self._downloader.report_error(u'unable to extract video width')
1049             return
1050         yv_video_width = mobj.group(1)
1051
1052         # Retrieve video playlist to extract media URL
1053         # I'm not completely sure what all these options are, but we
1054         # seem to need most of them, otherwise the server sends a 401.
1055         yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1056         yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1057         request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1060         try:
1061             self.report_download_webpage(video_id)
1062             webpage = compat_urllib_request.urlopen(request).read()
1063         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1065             return
1066
1067         # Extract media URL from playlist XML
1068         mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1069         if mobj is None:
1070             self._downloader.report_error(u'Unable to extract media URL')
1071             return
1072         video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073         video_url = unescapeHTML(video_url)
1074
1075         return [{
1076             'id':       video_id.decode('utf-8'),
1077             'url':      video_url,
1078             'uploader': video_uploader,
1079             'upload_date':  None,
1080             'title':    video_title,
1081             'ext':      video_extension.decode('utf-8'),
1082             'thumbnail':    video_thumbnail.decode('utf-8'),
1083             'description':  video_description,
1084         }]
1085
1086
1087 class VimeoIE(InfoExtractor):
1088     """Information extractor for vimeo.com."""
1089
1090     # _VALID_URL matches Vimeo URLs
1091     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1092     IE_NAME = u'vimeo'
1093
1094     def __init__(self, downloader=None):
1095         InfoExtractor.__init__(self, downloader)
1096
1097     def report_download_webpage(self, video_id):
1098         """Report webpage download."""
1099         self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1100
1101     def report_extraction(self, video_id):
1102         """Report information extraction."""
1103         self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1104
1105     def _real_extract(self, url, new_video=True):
1106         # Extract ID from URL
1107         mobj = re.match(self._VALID_URL, url)
1108         if mobj is None:
1109             self._downloader.report_error(u'Invalid URL: %s' % url)
1110             return
1111
1112         video_id = mobj.group('id')
1113         if not mobj.group('proto'):
1114             url = 'https://' + url
1115         if mobj.group('direct_link'):
1116             url = 'https://vimeo.com/' + video_id
1117
1118         # Retrieve video webpage to extract further information
1119         request = compat_urllib_request.Request(url, None, std_headers)
1120         try:
1121             self.report_download_webpage(video_id)
1122             webpage_bytes = compat_urllib_request.urlopen(request).read()
1123             webpage = webpage_bytes.decode('utf-8')
1124         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1125             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1126             return
1127
1128         # Now we begin extracting as much information as we can from what we
1129         # retrieved. First we extract the information common to all extractors,
1130         # and latter we extract those that are Vimeo specific.
1131         self.report_extraction(video_id)
1132
1133         # Extract the config JSON
1134         try:
1135             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1136             config = json.loads(config)
1137         except:
1138             self._downloader.report_error(u'unable to extract info section')
1139             return
1140
1141         # Extract title
1142         video_title = config["video"]["title"]
1143
1144         # Extract uploader and uploader_id
1145         video_uploader = config["video"]["owner"]["name"]
1146         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1147
1148         # Extract video thumbnail
1149         video_thumbnail = config["video"]["thumbnail"]
1150
1151         # Extract video description
1152         video_description = get_element_by_attribute("itemprop", "description", webpage)
1153         if video_description: video_description = clean_html(video_description)
1154         else: video_description = u''
1155
1156         # Extract upload date
1157         video_upload_date = None
1158         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1159         if mobj is not None:
1160             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1161
1162         # Vimeo specific: extract request signature and timestamp
1163         sig = config['request']['signature']
1164         timestamp = config['request']['timestamp']
1165
1166         # Vimeo specific: extract video codec and quality information
1167         # First consider quality, then codecs, then take everything
1168         # TODO bind to format param
1169         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1170         files = { 'hd': [], 'sd': [], 'other': []}
1171         for codec_name, codec_extension in codecs:
1172             if codec_name in config["video"]["files"]:
1173                 if 'hd' in config["video"]["files"][codec_name]:
1174                     files['hd'].append((codec_name, codec_extension, 'hd'))
1175                 elif 'sd' in config["video"]["files"][codec_name]:
1176                     files['sd'].append((codec_name, codec_extension, 'sd'))
1177                 else:
1178                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1179
1180         for quality in ('hd', 'sd', 'other'):
1181             if len(files[quality]) > 0:
1182                 video_quality = files[quality][0][2]
1183                 video_codec = files[quality][0][0]
1184                 video_extension = files[quality][0][1]
1185                 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1186                 break
1187         else:
1188             self._downloader.report_error(u'no known codec found')
1189             return
1190
1191         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1192                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1193
1194         return [{
1195             'id':       video_id,
1196             'url':      video_url,
1197             'uploader': video_uploader,
1198             'uploader_id': video_uploader_id,
1199             'upload_date':  video_upload_date,
1200             'title':    video_title,
1201             'ext':      video_extension,
1202             'thumbnail':    video_thumbnail,
1203             'description':  video_description,
1204         }]
1205
1206
1207 class ArteTvIE(InfoExtractor):
1208     """arte.tv information extractor."""
1209
1210     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1211     _LIVE_URL = r'index-[0-9]+\.html$'
1212
1213     IE_NAME = u'arte.tv'
1214
1215     def __init__(self, downloader=None):
1216         InfoExtractor.__init__(self, downloader)
1217
1218     def report_download_webpage(self, video_id):
1219         """Report webpage download."""
1220         self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1221
1222     def report_extraction(self, video_id):
1223         """Report information extraction."""
1224         self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1225
1226     def fetch_webpage(self, url):
1227         request = compat_urllib_request.Request(url)
1228         try:
1229             self.report_download_webpage(url)
1230             webpage = compat_urllib_request.urlopen(request).read()
1231         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1232             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1233             return
1234         except ValueError as err:
1235             self._downloader.report_error(u'Invalid URL: %s' % url)
1236             return
1237         return webpage
1238
1239     def grep_webpage(self, url, regex, regexFlags, matchTuples):
1240         page = self.fetch_webpage(url)
1241         mobj = re.search(regex, page, regexFlags)
1242         info = {}
1243
1244         if mobj is None:
1245             self._downloader.report_error(u'Invalid URL: %s' % url)
1246             return
1247
1248         for (i, key, err) in matchTuples:
1249             if mobj.group(i) is None:
1250                 self._downloader.trouble(err)
1251                 return
1252             else:
1253                 info[key] = mobj.group(i)
1254
1255         return info
1256
1257     def extractLiveStream(self, url):
1258         video_lang = url.split('/')[-4]
1259         info = self.grep_webpage(
1260             url,
1261             r'src="(.*?/videothek_js.*?\.js)',
1262             0,
1263             [
1264                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1265             ]
1266         )
1267         http_host = url.split('/')[2]
1268         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1269         info = self.grep_webpage(
1270             next_url,
1271             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1272                 '(http://.*?\.swf).*?' +
1273                 '(rtmp://.*?)\'',
1274             re.DOTALL,
1275             [
1276                 (1, 'path',   u'ERROR: could not extract video path: %s' % url),
1277                 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1278                 (3, 'url',    u'ERROR: could not extract video url: %s' % url)
1279             ]
1280         )
1281         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1282
1283     def extractPlus7Stream(self, url):
1284         video_lang = url.split('/')[-3]
1285         info = self.grep_webpage(
1286             url,
1287             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1288             0,
1289             [
1290                 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1291             ]
1292         )
1293         next_url = compat_urllib_parse.unquote(info.get('url'))
1294         info = self.grep_webpage(
1295             next_url,
1296             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1297             0,
1298             [
1299                 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1300             ]
1301         )
1302         next_url = compat_urllib_parse.unquote(info.get('url'))
1303
1304         info = self.grep_webpage(
1305             next_url,
1306             r'<video id="(.*?)".*?>.*?' +
1307                 '<name>(.*?)</name>.*?' +
1308                 '<dateVideo>(.*?)</dateVideo>.*?' +
1309                 '<url quality="hd">(.*?)</url>',
1310             re.DOTALL,
1311             [
1312                 (1, 'id',    u'ERROR: could not extract video id: %s' % url),
1313                 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1314                 (3, 'date',  u'ERROR: could not extract video date: %s' % url),
1315                 (4, 'url',   u'ERROR: could not extract video url: %s' % url)
1316             ]
1317         )
1318
1319         return {
1320             'id':           info.get('id'),
1321             'url':          compat_urllib_parse.unquote(info.get('url')),
1322             'uploader':     u'arte.tv',
1323             'upload_date':  info.get('date'),
1324             'title':        info.get('title').decode('utf-8'),
1325             'ext':          u'mp4',
1326             'format':       u'NA',
1327             'player_url':   None,
1328         }
1329
1330     def _real_extract(self, url):
1331         video_id = url.split('/')[-1]
1332         self.report_extraction(video_id)
1333
1334         if re.search(self._LIVE_URL, video_id) is not None:
1335             self.extractLiveStream(url)
1336             return
1337         else:
1338             info = self.extractPlus7Stream(url)
1339
1340         return [info]
1341
1342
1343 class GenericIE(InfoExtractor):
1344     """Generic last-resort information extractor."""
1345
1346     _VALID_URL = r'.*'
1347     IE_NAME = u'generic'
1348
1349     def __init__(self, downloader=None):
1350         InfoExtractor.__init__(self, downloader)
1351
1352     def report_download_webpage(self, video_id):
1353         """Report webpage download."""
1354         if not self._downloader.params.get('test', False):
1355             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1356         self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1357
1358     def report_extraction(self, video_id):
1359         """Report information extraction."""
1360         self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1361
1362     def report_following_redirect(self, new_url):
1363         """Report information extraction."""
1364         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1365
1366     def _test_redirect(self, url):
1367         """Check if it is a redirect, like url shorteners, in case return the new url."""
1368         class HeadRequest(compat_urllib_request.Request):
1369             def get_method(self):
1370                 return "HEAD"
1371
1372         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1373             """
1374             Subclass the HTTPRedirectHandler to make it use our
1375             HeadRequest also on the redirected URL
1376             """
1377             def redirect_request(self, req, fp, code, msg, headers, newurl):
1378                 if code in (301, 302, 303, 307):
1379                     newurl = newurl.replace(' ', '%20')
1380                     newheaders = dict((k,v) for k,v in req.headers.items()
1381                                       if k.lower() not in ("content-length", "content-type"))
1382                     return HeadRequest(newurl,
1383                                        headers=newheaders,
1384                                        origin_req_host=req.get_origin_req_host(),
1385                                        unverifiable=True)
1386                 else:
1387                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1388
1389         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1390             """
1391             Fallback to GET if HEAD is not allowed (405 HTTP error)
1392             """
1393             def http_error_405(self, req, fp, code, msg, headers):
1394                 fp.read()
1395                 fp.close()
1396
1397                 newheaders = dict((k,v) for k,v in req.headers.items()
1398                                   if k.lower() not in ("content-length", "content-type"))
1399                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1400                                                  headers=newheaders,
1401                                                  origin_req_host=req.get_origin_req_host(),
1402                                                  unverifiable=True))
1403
1404         # Build our opener
1405         opener = compat_urllib_request.OpenerDirector()
1406         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1407                         HTTPMethodFallback, HEADRedirectHandler,
1408                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1409             opener.add_handler(handler())
1410
1411         response = opener.open(HeadRequest(url))
1412         new_url = response.geturl()
1413
1414         if url == new_url:
1415             return False
1416
1417         self.report_following_redirect(new_url)
1418         return new_url
1419
1420     def _real_extract(self, url):
1421         new_url = self._test_redirect(url)
1422         if new_url: return [self.url_result(new_url)]
1423
1424         video_id = url.split('/')[-1]
1425         try:
1426             webpage = self._download_webpage(url, video_id)
1427         except ValueError as err:
1428             # since this is the last-resort InfoExtractor, if
1429             # this error is thrown, it'll be thrown here
1430             self._downloader.report_error(u'Invalid URL: %s' % url)
1431             return
1432
1433         self.report_extraction(video_id)
1434         # Start with something easy: JW Player in SWFObject
1435         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1436         if mobj is None:
1437             # Broaden the search a little bit
1438             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1439         if mobj is None:
1440             # Broaden the search a little bit: JWPlayer JS loader
1441             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1442         if mobj is None:
1443             self._downloader.report_error(u'Invalid URL: %s' % url)
1444             return
1445
1446         # It's possible that one of the regexes
1447         # matched, but returned an empty group:
1448         if mobj.group(1) is None:
1449             self._downloader.report_error(u'Invalid URL: %s' % url)
1450             return
1451
1452         video_url = compat_urllib_parse.unquote(mobj.group(1))
1453         video_id = os.path.basename(video_url)
1454
1455         # here's a fun little line of code for you:
1456         video_extension = os.path.splitext(video_id)[1][1:]
1457         video_id = os.path.splitext(video_id)[0]
1458
1459         # it's tempting to parse this further, but you would
1460         # have to take into account all the variations like
1461         #   Video Title - Site Name
1462         #   Site Name | Video Title
1463         #   Video Title - Tagline | Site Name
1464         # and so on and so forth; it's just not practical
1465         mobj = re.search(r'<title>(.*)</title>', webpage)
1466         if mobj is None:
1467             self._downloader.report_error(u'unable to extract title')
1468             return
1469         video_title = mobj.group(1)
1470
1471         # video uploader is domain name
1472         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1473         if mobj is None:
1474             self._downloader.report_error(u'unable to extract title')
1475             return
1476         video_uploader = mobj.group(1)
1477
1478         return [{
1479             'id':       video_id,
1480             'url':      video_url,
1481             'uploader': video_uploader,
1482             'upload_date':  None,
1483             'title':    video_title,
1484             'ext':      video_extension,
1485         }]
1486
1487
1488 class YoutubeSearchIE(InfoExtractor):
1489     """Information Extractor for YouTube search queries."""
1490     _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1491     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1492     _max_youtube_results = 1000
1493     IE_NAME = u'youtube:search'
1494
1495     def __init__(self, downloader=None):
1496         InfoExtractor.__init__(self, downloader)
1497
1498     def report_download_page(self, query, pagenum):
1499         """Report attempt to download search page with given number."""
1500         query = query.decode(preferredencoding())
1501         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1502
1503     def _real_extract(self, query):
1504         mobj = re.match(self._VALID_URL, query)
1505         if mobj is None:
1506             self._downloader.report_error(u'invalid search query "%s"' % query)
1507             return
1508
1509         prefix, query = query.split(':')
1510         prefix = prefix[8:]
1511         query = query.encode('utf-8')
1512         if prefix == '':
1513             self._download_n_results(query, 1)
1514             return
1515         elif prefix == 'all':
1516             self._download_n_results(query, self._max_youtube_results)
1517             return
1518         else:
1519             try:
1520                 n = int(prefix)
1521                 if n <= 0:
1522                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1523                     return
1524                 elif n > self._max_youtube_results:
1525                     self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1526                     n = self._max_youtube_results
1527                 self._download_n_results(query, n)
1528                 return
1529             except ValueError: # parsing prefix as integer fails
1530                 self._download_n_results(query, 1)
1531                 return
1532
1533     def _download_n_results(self, query, n):
1534         """Downloads a specified number of results for a query"""
1535
1536         video_ids = []
1537         pagenum = 0
1538         limit = n
1539
1540         while (50 * pagenum) < limit:
1541             self.report_download_page(query, pagenum+1)
1542             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1543             request = compat_urllib_request.Request(result_url)
1544             try:
1545                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1546             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1547                 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1548                 return
1549             api_response = json.loads(data)['data']
1550
1551             if not 'items' in api_response:
1552                 self._downloader.trouble(u'[youtube] No video results')
1553                 return
1554
1555             new_ids = list(video['id'] for video in api_response['items'])
1556             video_ids += new_ids
1557
1558             limit = min(n, api_response['totalItems'])
1559             pagenum += 1
1560
1561         if len(video_ids) > n:
1562             video_ids = video_ids[:n]
1563         for id in video_ids:
1564             self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1565         return
1566
1567
1568 class GoogleSearchIE(InfoExtractor):
1569     """Information Extractor for Google Video search queries."""
1570     _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1571     _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1572     _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1573     _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1574     _max_google_results = 1000
1575     IE_NAME = u'video.google:search'
1576
1577     def __init__(self, downloader=None):
1578         InfoExtractor.__init__(self, downloader)
1579
1580     def report_download_page(self, query, pagenum):
1581         """Report attempt to download playlist page with given number."""
1582         query = query.decode(preferredencoding())
1583         self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1584
1585     def _real_extract(self, query):
1586         mobj = re.match(self._VALID_URL, query)
1587         if mobj is None:
1588             self._downloader.report_error(u'invalid search query "%s"' % query)
1589             return
1590
1591         prefix, query = query.split(':')
1592         prefix = prefix[8:]
1593         query = query.encode('utf-8')
1594         if prefix == '':
1595             self._download_n_results(query, 1)
1596             return
1597         elif prefix == 'all':
1598             self._download_n_results(query, self._max_google_results)
1599             return
1600         else:
1601             try:
1602                 n = int(prefix)
1603                 if n <= 0:
1604                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1605                     return
1606                 elif n > self._max_google_results:
1607                     self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1608                     n = self._max_google_results
1609                 self._download_n_results(query, n)
1610                 return
1611             except ValueError: # parsing prefix as integer fails
1612                 self._download_n_results(query, 1)
1613                 return
1614
1615     def _download_n_results(self, query, n):
1616         """Downloads a specified number of results for a query"""
1617
1618         video_ids = []
1619         pagenum = 0
1620
1621         while True:
1622             self.report_download_page(query, pagenum)
1623             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1624             request = compat_urllib_request.Request(result_url)
1625             try:
1626                 page = compat_urllib_request.urlopen(request).read()
1627             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1628                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1629                 return
1630
1631             # Extract video identifiers
1632             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1633                 video_id = mobj.group(1)
1634                 if video_id not in video_ids:
1635                     video_ids.append(video_id)
1636                     if len(video_ids) == n:
1637                         # Specified n videos reached
1638                         for id in video_ids:
1639                             self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1640                         return
1641
1642             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1643                 for id in video_ids:
1644                     self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1645                 return
1646
1647             pagenum = pagenum + 1
1648
1649
1650 class YahooSearchIE(InfoExtractor):
1651     """Information Extractor for Yahoo! Video search queries."""
1652
1653     _WORKING = False
1654     _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1655     _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1656     _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1657     _MORE_PAGES_INDICATOR = r'\s*Next'
1658     _max_yahoo_results = 1000
1659     IE_NAME = u'video.yahoo:search'
1660
1661     def __init__(self, downloader=None):
1662         InfoExtractor.__init__(self, downloader)
1663
1664     def report_download_page(self, query, pagenum):
1665         """Report attempt to download playlist page with given number."""
1666         query = query.decode(preferredencoding())
1667         self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1668
1669     def _real_extract(self, query):
1670         mobj = re.match(self._VALID_URL, query)
1671         if mobj is None:
1672             self._downloader.report_error(u'invalid search query "%s"' % query)
1673             return
1674
1675         prefix, query = query.split(':')
1676         prefix = prefix[8:]
1677         query = query.encode('utf-8')
1678         if prefix == '':
1679             self._download_n_results(query, 1)
1680             return
1681         elif prefix == 'all':
1682             self._download_n_results(query, self._max_yahoo_results)
1683             return
1684         else:
1685             try:
1686                 n = int(prefix)
1687                 if n <= 0:
1688                     self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1689                     return
1690                 elif n > self._max_yahoo_results:
1691                     self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1692                     n = self._max_yahoo_results
1693                 self._download_n_results(query, n)
1694                 return
1695             except ValueError: # parsing prefix as integer fails
1696                 self._download_n_results(query, 1)
1697                 return
1698
1699     def _download_n_results(self, query, n):
1700         """Downloads a specified number of results for a query"""
1701
1702         video_ids = []
1703         already_seen = set()
1704         pagenum = 1
1705
1706         while True:
1707             self.report_download_page(query, pagenum)
1708             result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1709             request = compat_urllib_request.Request(result_url)
1710             try:
1711                 page = compat_urllib_request.urlopen(request).read()
1712             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1713                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1714                 return
1715
1716             # Extract video identifiers
1717             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1718                 video_id = mobj.group(1)
1719                 if video_id not in already_seen:
1720                     video_ids.append(video_id)
1721                     already_seen.add(video_id)
1722                     if len(video_ids) == n:
1723                         # Specified n videos reached
1724                         for id in video_ids:
1725                             self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1726                         return
1727
1728             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1729                 for id in video_ids:
1730                     self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1731                 return
1732
1733             pagenum = pagenum + 1
1734
1735
1736 class YoutubePlaylistIE(InfoExtractor):
1737     """Information Extractor for YouTube playlists."""
1738
1739     _VALID_URL = r"""(?:
1740                         (?:https?://)?
1741                         (?:\w+\.)?
1742                         youtube\.com/
1743                         (?:
1744                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
1745                            \? (?:.*?&)*? (?:p|a|list)=
1746                         |  p/
1747                         )
1748                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1749                         .*
1750                      |
1751                         ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1752                      )"""
1753     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1754     _MAX_RESULTS = 50
1755     IE_NAME = u'youtube:playlist'
1756
1757     def __init__(self, downloader=None):
1758         InfoExtractor.__init__(self, downloader)
1759
1760     @classmethod
1761     def suitable(cls, url):
1762         """Receives a URL and returns True if suitable for this IE."""
1763         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1764
1765     def report_download_page(self, playlist_id, pagenum):
1766         """Report attempt to download playlist page with given number."""
1767         self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1768
1769     def _real_extract(self, url):
1770         # Extract playlist id
1771         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1772         if mobj is None:
1773             self._downloader.report_error(u'invalid url: %s' % url)
1774             return
1775
1776         # Download playlist videos from API
1777         playlist_id = mobj.group(1) or mobj.group(2)
1778         page_num = 1
1779         videos = []
1780
1781         while True:
1782             self.report_download_page(playlist_id, page_num)
1783
1784             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1785             try:
1786                 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1787             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1788                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1789                 return
1790
1791             try:
1792                 response = json.loads(page)
1793             except ValueError as err:
1794                 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1795                 return
1796
1797             if 'feed' not in response:
1798                 self._downloader.report_error(u'Got a malformed response from YouTube API')
1799                 return
1800             if 'entry' not in response['feed']:
1801                 # Number of videos is a multiple of self._MAX_RESULTS
1802                 break
1803
1804             videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1805                         for entry in response['feed']['entry']
1806                         if 'content' in entry ]
1807
1808             if len(response['feed']['entry']) < self._MAX_RESULTS:
1809                 break
1810             page_num += 1
1811
1812         videos = [v[1] for v in sorted(videos)]
1813
1814         url_results = [self.url_result(url, 'Youtube') for url in videos]
1815         return [self.playlist_result(url_results, playlist_id)]
1816
1817
1818 class YoutubeChannelIE(InfoExtractor):
1819     """Information Extractor for YouTube channels."""
1820
1821     _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1822     _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1823     _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1824     _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1825     IE_NAME = u'youtube:channel'
1826
1827     def report_download_page(self, channel_id, pagenum):
1828         """Report attempt to download channel page with given number."""
1829         self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1830
1831     def extract_videos_from_page(self, page):
1832         ids_in_page = []
1833         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1834             if mobj.group(1) not in ids_in_page:
1835                 ids_in_page.append(mobj.group(1))
1836         return ids_in_page
1837
1838     def _real_extract(self, url):
1839         # Extract channel id
1840         mobj = re.match(self._VALID_URL, url)
1841         if mobj is None:
1842             self._downloader.report_error(u'invalid url: %s' % url)
1843             return
1844
1845         # Download channel page
1846         channel_id = mobj.group(1)
1847         video_ids = []
1848         pagenum = 1
1849
1850         self.report_download_page(channel_id, pagenum)
1851         url = self._TEMPLATE_URL % (channel_id, pagenum)
1852         request = compat_urllib_request.Request(url)
1853         try:
1854             page = compat_urllib_request.urlopen(request).read().decode('utf8')
1855         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1856             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1857             return
1858
1859         # Extract video identifiers
1860         ids_in_page = self.extract_videos_from_page(page)
1861         video_ids.extend(ids_in_page)
1862
1863         # Download any subsequent channel pages using the json-based channel_ajax query
1864         if self._MORE_PAGES_INDICATOR in page:
1865             while True:
1866                 pagenum = pagenum + 1
1867
1868                 self.report_download_page(channel_id, pagenum)
1869                 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1870                 request = compat_urllib_request.Request(url)
1871                 try:
1872                     page = compat_urllib_request.urlopen(request).read().decode('utf8')
1873                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1874                     self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1875                     return
1876
1877                 page = json.loads(page)
1878
1879                 ids_in_page = self.extract_videos_from_page(page['content_html'])
1880                 video_ids.extend(ids_in_page)
1881
1882                 if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
1883                     break
1884
1885         self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1886
1887         urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1888         url_entries = [self.url_result(url, 'Youtube') for url in urls]
1889         return [self.playlist_result(url_entries, channel_id)]
1890
1891
1892 class YoutubeUserIE(InfoExtractor):
1893     """Information Extractor for YouTube users."""
1894
1895     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1896     _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1897     _GDATA_PAGE_SIZE = 50
1898     _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1899     _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1900     IE_NAME = u'youtube:user'
1901
1902     def __init__(self, downloader=None):
1903         InfoExtractor.__init__(self, downloader)
1904
1905     def report_download_page(self, username, start_index):
1906         """Report attempt to download user page."""
1907         self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1908                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1909
1910     def _real_extract(self, url):
1911         # Extract username
1912         mobj = re.match(self._VALID_URL, url)
1913         if mobj is None:
1914             self._downloader.report_error(u'invalid url: %s' % url)
1915             return
1916
1917         username = mobj.group(1)
1918
1919         # Download video ids using YouTube Data API. Result size per
1920         # query is limited (currently to 50 videos) so we need to query
1921         # page by page until there are no video ids - it means we got
1922         # all of them.
1923
1924         video_ids = []
1925         pagenum = 0
1926
1927         while True:
1928             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1929             self.report_download_page(username, start_index)
1930
1931             request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1932
1933             try:
1934                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1935             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1936                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1937                 return
1938
1939             # Extract video identifiers
1940             ids_in_page = []
1941
1942             for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1943                 if mobj.group(1) not in ids_in_page:
1944                     ids_in_page.append(mobj.group(1))
1945
1946             video_ids.extend(ids_in_page)
1947
1948             # A little optimization - if current page is not
1949             # "full", ie. does not contain PAGE_SIZE video ids then
1950             # we can assume that this page is the last one - there
1951             # are no more ids on further pages - no need to query
1952             # again.
1953
1954             if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1955                 break
1956
1957             pagenum += 1
1958
1959         urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1960         url_results = [self.url_result(url, 'Youtube') for url in urls]
1961         return [self.playlist_result(url_results, playlist_title = username)]
1962
1963
1964 class BlipTVUserIE(InfoExtractor):
1965     """Information Extractor for blip.tv users."""
1966
1967     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1968     _PAGE_SIZE = 12
1969     IE_NAME = u'blip.tv:user'
1970
1971     def __init__(self, downloader=None):
1972         InfoExtractor.__init__(self, downloader)
1973
1974     def report_download_page(self, username, pagenum):
1975         """Report attempt to download user page."""
1976         self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1977                 (self.IE_NAME, username, pagenum))
1978
1979     def _real_extract(self, url):
1980         # Extract username
1981         mobj = re.match(self._VALID_URL, url)
1982         if mobj is None:
1983             self._downloader.report_error(u'invalid url: %s' % url)
1984             return
1985
1986         username = mobj.group(1)
1987
1988         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1989
1990         request = compat_urllib_request.Request(url)
1991
1992         try:
1993             page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1994             mobj = re.search(r'data-users-id="([^"]+)"', page)
1995             page_base = page_base % mobj.group(1)
1996         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1997             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1998             return
1999
2000
2001         # Download video ids using BlipTV Ajax calls. Result size per
2002         # query is limited (currently to 12 videos) so we need to query
2003         # page by page until there are no video ids - it means we got
2004         # all of them.
2005
2006         video_ids = []
2007         pagenum = 1
2008
2009         while True:
2010             self.report_download_page(username, pagenum)
2011             url = page_base + "&page=" + str(pagenum)
2012             request = compat_urllib_request.Request( url )
2013             try:
2014                 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2015             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2016                 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2017                 return
2018
2019             # Extract video identifiers
2020             ids_in_page = []
2021
2022             for mobj in re.finditer(r'href="/([^"]+)"', page):
2023                 if mobj.group(1) not in ids_in_page:
2024                     ids_in_page.append(unescapeHTML(mobj.group(1)))
2025
2026             video_ids.extend(ids_in_page)
2027
2028             # A little optimization - if current page is not
2029             # "full", ie. does not contain PAGE_SIZE video ids then
2030             # we can assume that this page is the last one - there
2031             # are no more ids on further pages - no need to query
2032             # again.
2033
2034             if len(ids_in_page) < self._PAGE_SIZE:
2035                 break
2036
2037             pagenum += 1
2038
2039         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2040         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
2041         return [self.playlist_result(url_entries, playlist_title = username)]
2042
2043
2044 class DepositFilesIE(InfoExtractor):
2045     """Information extractor for depositfiles.com"""
2046
2047     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2048
2049     def report_download_webpage(self, file_id):
2050         """Report webpage download."""
2051         self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2052
2053     def report_extraction(self, file_id):
2054         """Report information extraction."""
2055         self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2056
2057     def _real_extract(self, url):
2058         file_id = url.split('/')[-1]
2059         # Rebuild url in english locale
2060         url = 'http://depositfiles.com/en/files/' + file_id
2061
2062         # Retrieve file webpage with 'Free download' button pressed
2063         free_download_indication = { 'gateway_result' : '1' }
2064         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2065         try:
2066             self.report_download_webpage(file_id)
2067             webpage = compat_urllib_request.urlopen(request).read()
2068         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2069             self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2070             return
2071
2072         # Search for the real file URL
2073         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2074         if (mobj is None) or (mobj.group(1) is None):
2075             # Try to figure out reason of the error.
2076             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2077             if (mobj is not None) and (mobj.group(1) is not None):
2078                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2079                 self._downloader.report_error(u'%s' % restriction_message)
2080             else:
2081                 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2082             return
2083
2084         file_url = mobj.group(1)
2085         file_extension = os.path.splitext(file_url)[1][1:]
2086
2087         # Search for file title
2088         mobj = re.search(r'<b title="(.*?)">', webpage)
2089         if mobj is None:
2090             self._downloader.report_error(u'unable to extract title')
2091             return
2092         file_title = mobj.group(1).decode('utf-8')
2093
2094         return [{
2095             'id':       file_id.decode('utf-8'),
2096             'url':      file_url.decode('utf-8'),
2097             'uploader': None,
2098             'upload_date':  None,
2099             'title':    file_title,
2100             'ext':      file_extension.decode('utf-8'),
2101         }]
2102
2103
2104 class FacebookIE(InfoExtractor):
2105     """Information Extractor for Facebook"""
2106
2107     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2108     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2109     _NETRC_MACHINE = 'facebook'
2110     IE_NAME = u'facebook'
2111
2112     def report_login(self):
2113         """Report attempt to log in."""
2114         self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2115
2116     def _real_initialize(self):
2117         if self._downloader is None:
2118             return
2119
2120         useremail = None
2121         password = None
2122         downloader_params = self._downloader.params
2123
2124         # Attempt to use provided username and password or .netrc data
2125         if downloader_params.get('username', None) is not None:
2126             useremail = downloader_params['username']
2127             password = downloader_params['password']
2128         elif downloader_params.get('usenetrc', False):
2129             try:
2130                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2131                 if info is not None:
2132                     useremail = info[0]
2133                     password = info[2]
2134                 else:
2135                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2136             except (IOError, netrc.NetrcParseError) as err:
2137                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2138                 return
2139
2140         if useremail is None:
2141             return
2142
2143         # Log in
2144         login_form = {
2145             'email': useremail,
2146             'pass': password,
2147             'login': 'Log+In'
2148             }
2149         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2150         try:
2151             self.report_login()
2152             login_results = compat_urllib_request.urlopen(request).read()
2153             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2154                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2155                 return
2156         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2157             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2158             return
2159
2160     def _real_extract(self, url):
2161         mobj = re.match(self._VALID_URL, url)
2162         if mobj is None:
2163             self._downloader.report_error(u'invalid URL: %s' % url)
2164             return
2165         video_id = mobj.group('ID')
2166
2167         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2168         webpage = self._download_webpage(url, video_id)
2169
2170         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2171         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2172         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2173         if not m:
2174             raise ExtractorError(u'Cannot parse data')
2175         data = dict(json.loads(m.group(1)))
2176         params_raw = compat_urllib_parse.unquote(data['params'])
2177         params = json.loads(params_raw)
2178         video_data = params['video_data'][0]
2179         video_url = video_data.get('hd_src')
2180         if not video_url:
2181             video_url = video_data['sd_src']
2182         if not video_url:
2183             raise ExtractorError(u'Cannot find video URL')
2184         video_duration = int(video_data['video_duration'])
2185         thumbnail = video_data['thumbnail_src']
2186
2187         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2188         if not m:
2189             raise ExtractorError(u'Cannot find title in webpage')
2190         video_title = unescapeHTML(m.group(1))
2191
2192         info = {
2193             'id': video_id,
2194             'title': video_title,
2195             'url': video_url,
2196             'ext': 'mp4',
2197             'duration': video_duration,
2198             'thumbnail': thumbnail,
2199         }
2200         return [info]
2201
2202
2203 class BlipTVIE(InfoExtractor):
2204     """Information extractor for blip.tv"""
2205
2206     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2207     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2208     IE_NAME = u'blip.tv'
2209
2210     def report_extraction(self, file_id):
2211         """Report information extraction."""
2212         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2213
2214     def report_direct_download(self, title):
2215         """Report information extraction."""
2216         self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2217
2218     def _real_extract(self, url):
2219         mobj = re.match(self._VALID_URL, url)
2220         if mobj is None:
2221             self._downloader.report_error(u'invalid URL: %s' % url)
2222             return
2223
2224         urlp = compat_urllib_parse_urlparse(url)
2225         if urlp.path.startswith('/play/'):
2226             request = compat_urllib_request.Request(url)
2227             response = compat_urllib_request.urlopen(request)
2228             redirecturl = response.geturl()
2229             rurlp = compat_urllib_parse_urlparse(redirecturl)
2230             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2231             url = 'http://blip.tv/a/a-' + file_id
2232             return self._real_extract(url)
2233
2234
2235         if '?' in url:
2236             cchar = '&'
2237         else:
2238             cchar = '?'
2239         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2240         request = compat_urllib_request.Request(json_url)
2241         request.add_header('User-Agent', 'iTunes/10.6.1')
2242         self.report_extraction(mobj.group(1))
2243         info = None
2244         try:
2245             urlh = compat_urllib_request.urlopen(request)
2246             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2247                 basename = url.split('/')[-1]
2248                 title,ext = os.path.splitext(basename)
2249                 title = title.decode('UTF-8')
2250                 ext = ext.replace('.', '')
2251                 self.report_direct_download(title)
2252                 info = {
2253                     'id': title,
2254                     'url': url,
2255                     'uploader': None,
2256                     'upload_date': None,
2257                     'title': title,
2258                     'ext': ext,
2259                     'urlhandle': urlh
2260                 }
2261         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2262             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2263         if info is None: # Regular URL
2264             try:
2265                 json_code_bytes = urlh.read()
2266                 json_code = json_code_bytes.decode('utf-8')
2267             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2268                 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2269                 return
2270
2271             try:
2272                 json_data = json.loads(json_code)
2273                 if 'Post' in json_data:
2274                     data = json_data['Post']
2275                 else:
2276                     data = json_data
2277
2278                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2279                 video_url = data['media']['url']
2280                 umobj = re.match(self._URL_EXT, video_url)
2281                 if umobj is None:
2282                     raise ValueError('Can not determine filename extension')
2283                 ext = umobj.group(1)
2284
2285                 info = {
2286                     'id': data['item_id'],
2287                     'url': video_url,
2288                     'uploader': data['display_name'],
2289                     'upload_date': upload_date,
2290                     'title': data['title'],
2291                     'ext': ext,
2292                     'format': data['media']['mimeType'],
2293                     'thumbnail': data['thumbnailUrl'],
2294                     'description': data['description'],
2295                     'player_url': data['embedUrl'],
2296                     'user_agent': 'iTunes/10.6.1',
2297                 }
2298             except (ValueError,KeyError) as err:
2299                 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2300                 return
2301
2302         return [info]
2303
2304
2305 class MyVideoIE(InfoExtractor):
2306     """Information Extractor for myvideo.de."""
2307
2308     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2309     IE_NAME = u'myvideo'
2310
2311     def __init__(self, downloader=None):
2312         InfoExtractor.__init__(self, downloader)
2313
2314     def report_extraction(self, video_id):
2315         """Report information extraction."""
2316         self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2317
2318     def _real_extract(self,url):
2319         mobj = re.match(self._VALID_URL, url)
2320         if mobj is None:
2321             self._download.report_error(u'invalid URL: %s' % url)
2322             return
2323
2324         video_id = mobj.group(1)
2325
2326         # Get video webpage
2327         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2328         webpage = self._download_webpage(webpage_url, video_id)
2329
2330         self.report_extraction(video_id)
2331         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2332                  webpage)
2333         if mobj is None:
2334             self._downloader.report_error(u'unable to extract media URL')
2335             return
2336         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2337
2338         mobj = re.search('<title>([^<]+)</title>', webpage)
2339         if mobj is None:
2340             self._downloader.report_error(u'unable to extract title')
2341             return
2342
2343         video_title = mobj.group(1)
2344
2345         return [{
2346             'id':       video_id,
2347             'url':      video_url,
2348             'uploader': None,
2349             'upload_date':  None,
2350             'title':    video_title,
2351             'ext':      u'flv',
2352         }]
2353
2354 class ComedyCentralIE(InfoExtractor):
2355     """Information extractor for The Daily Show and Colbert Report """
2356
2357     # urls can be abbreviations like :thedailyshow or :colbert
2358     # urls for episodes like:
2359     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2360     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2361     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2362     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2363                       |(https?://)?(www\.)?
2364                           (?P<showname>thedailyshow|colbertnation)\.com/
2365                          (full-episodes/(?P<episode>.*)|
2366                           (?P<clip>
2367                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2368                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2369                      $"""
2370
2371     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2372
2373     _video_extensions = {
2374         '3500': 'mp4',
2375         '2200': 'mp4',
2376         '1700': 'mp4',
2377         '1200': 'mp4',
2378         '750': 'mp4',
2379         '400': 'mp4',
2380     }
2381     _video_dimensions = {
2382         '3500': '1280x720',
2383         '2200': '960x540',
2384         '1700': '768x432',
2385         '1200': '640x360',
2386         '750': '512x288',
2387         '400': '384x216',
2388     }
2389
2390     @classmethod
2391     def suitable(cls, url):
2392         """Receives a URL and returns True if suitable for this IE."""
2393         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2394
2395     def report_extraction(self, episode_id):
2396         self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2397
2398     def report_config_download(self, episode_id, media_id):
2399         self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2400
2401     def report_index_download(self, episode_id):
2402         self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2403
2404     def _print_formats(self, formats):
2405         print('Available formats:')
2406         for x in formats:
2407             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2408
2409
2410     def _real_extract(self, url):
2411         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2412         if mobj is None:
2413             self._downloader.report_error(u'invalid URL: %s' % url)
2414             return
2415
2416         if mobj.group('shortname'):
2417             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2418                 url = u'http://www.thedailyshow.com/full-episodes/'
2419             else:
2420                 url = u'http://www.colbertnation.com/full-episodes/'
2421             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2422             assert mobj is not None
2423
2424         if mobj.group('clip'):
2425             if mobj.group('showname') == 'thedailyshow':
2426                 epTitle = mobj.group('tdstitle')
2427             else:
2428                 epTitle = mobj.group('cntitle')
2429             dlNewest = False
2430         else:
2431             dlNewest = not mobj.group('episode')
2432             if dlNewest:
2433                 epTitle = mobj.group('showname')
2434             else:
2435                 epTitle = mobj.group('episode')
2436
2437         req = compat_urllib_request.Request(url)
2438         self.report_extraction(epTitle)
2439         try:
2440             htmlHandle = compat_urllib_request.urlopen(req)
2441             html = htmlHandle.read()
2442             webpage = html.decode('utf-8')
2443         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2444             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2445             return
2446         if dlNewest:
2447             url = htmlHandle.geturl()
2448             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2449             if mobj is None:
2450                 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2451                 return
2452             if mobj.group('episode') == '':
2453                 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2454                 return
2455             epTitle = mobj.group('episode')
2456
2457         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2458
2459         if len(mMovieParams) == 0:
2460             # The Colbert Report embeds the information in a without
2461             # a URL prefix; so extract the alternate reference
2462             # and then add the URL prefix manually.
2463
2464             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2465             if len(altMovieParams) == 0:
2466                 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2467                 return
2468             else:
2469                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2470
2471         uri = mMovieParams[0][1]
2472         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2473         self.report_index_download(epTitle)
2474         try:
2475             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2476         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2477             self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2478             return
2479
2480         results = []
2481
2482         idoc = xml.etree.ElementTree.fromstring(indexXml)
2483         itemEls = idoc.findall('.//item')
2484         for partNum,itemEl in enumerate(itemEls):
2485             mediaId = itemEl.findall('./guid')[0].text
2486             shortMediaId = mediaId.split(':')[-1]
2487             showId = mediaId.split(':')[-2].replace('.com', '')
2488             officialTitle = itemEl.findall('./title')[0].text
2489             officialDate = itemEl.findall('./pubDate')[0].text
2490
2491             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2492                         compat_urllib_parse.urlencode({'uri': mediaId}))
2493             configReq = compat_urllib_request.Request(configUrl)
2494             self.report_config_download(epTitle, shortMediaId)
2495             try:
2496                 configXml = compat_urllib_request.urlopen(configReq).read()
2497             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2498                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2499                 return
2500
2501             cdoc = xml.etree.ElementTree.fromstring(configXml)
2502             turls = []
2503             for rendition in cdoc.findall('.//rendition'):
2504                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2505                 turls.append(finfo)
2506
2507             if len(turls) == 0:
2508                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2509                 continue
2510
2511             if self._downloader.params.get('listformats', None):
2512                 self._print_formats([i[0] for i in turls])
2513                 return
2514
2515             # For now, just pick the highest bitrate
2516             format,rtmp_video_url = turls[-1]
2517
2518             # Get the format arg from the arg stream
2519             req_format = self._downloader.params.get('format', None)
2520
2521             # Select format if we can find one
2522             for f,v in turls:
2523                 if f == req_format:
2524                     format, rtmp_video_url = f, v
2525                     break
2526
2527             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2528             if not m:
2529                 raise ExtractorError(u'Cannot transform RTMP url')
2530             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2531             video_url = base + m.group('finalid')
2532
2533             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2534             info = {
2535                 'id': shortMediaId,
2536                 'url': video_url,
2537                 'uploader': showId,
2538                 'upload_date': officialDate,
2539                 'title': effTitle,
2540                 'ext': 'mp4',
2541                 'format': format,
2542                 'thumbnail': None,
2543                 'description': officialTitle,
2544             }
2545             results.append(info)
2546
2547         return results
2548
2549
2550 class EscapistIE(InfoExtractor):
2551     """Information extractor for The Escapist """
2552
2553     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2554     IE_NAME = u'escapist'
2555
2556     def report_extraction(self, showName):
2557         self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2558
2559     def report_config_download(self, showName):
2560         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2561
2562     def _real_extract(self, url):
2563         mobj = re.match(self._VALID_URL, url)
2564         if mobj is None:
2565             self._downloader.report_error(u'invalid URL: %s' % url)
2566             return
2567         showName = mobj.group('showname')
2568         videoId = mobj.group('episode')
2569
2570         self.report_extraction(showName)
2571         try:
2572             webPage = compat_urllib_request.urlopen(url)
2573             webPageBytes = webPage.read()
2574             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2575             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2576         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2577             self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
2578             return
2579
2580         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2581         description = unescapeHTML(descMatch.group(1))
2582         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2583         imgUrl = unescapeHTML(imgMatch.group(1))
2584         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2585         playerUrl = unescapeHTML(playerUrlMatch.group(1))
2586         configUrlMatch = re.search('config=(.*)$', playerUrl)
2587         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2588
2589         self.report_config_download(showName)
2590         try:
2591             configJSON = compat_urllib_request.urlopen(configUrl)
2592             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2593             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2594         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2595             self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2596             return
2597
2598         # Technically, it's JavaScript, not JSON
2599         configJSON = configJSON.replace("'", '"')
2600
2601         try:
2602             config = json.loads(configJSON)
2603         except (ValueError,) as err:
2604             self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2605             return
2606
2607         playlist = config['playlist']
2608         videoUrl = playlist[1]['url']
2609
2610         info = {
2611             'id': videoId,
2612             'url': videoUrl,
2613             'uploader': showName,
2614             'upload_date': None,
2615             'title': showName,
2616             'ext': 'mp4',
2617             'thumbnail': imgUrl,
2618             'description': description,
2619             'player_url': playerUrl,
2620         }
2621
2622         return [info]
2623
2624 class CollegeHumorIE(InfoExtractor):
2625     """Information extractor for collegehumor.com"""
2626
2627     _WORKING = False
2628     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2629     IE_NAME = u'collegehumor'
2630
2631     def report_manifest(self, video_id):
2632         """Report information extraction."""
2633         self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2634
2635     def report_extraction(self, video_id):
2636         """Report information extraction."""
2637         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2638
2639     def _real_extract(self, url):
2640         mobj = re.match(self._VALID_URL, url)
2641         if mobj is None:
2642             self._downloader.report_error(u'invalid URL: %s' % url)
2643             return
2644         video_id = mobj.group('videoid')
2645
2646         info = {
2647             'id': video_id,
2648             'uploader': None,
2649             'upload_date': None,
2650         }
2651
2652         self.report_extraction(video_id)
2653         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2654         try:
2655             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2656         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2657             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2658             return
2659
2660         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2661         try:
2662             videoNode = mdoc.findall('./video')[0]
2663             info['description'] = videoNode.findall('./description')[0].text
2664             info['title'] = videoNode.findall('./caption')[0].text
2665             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2666             manifest_url = videoNode.findall('./file')[0].text
2667         except IndexError:
2668             self._downloader.report_error(u'Invalid metadata XML file')
2669             return
2670
2671         manifest_url += '?hdcore=2.10.3'
2672         self.report_manifest(video_id)
2673         try:
2674             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2675         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2676             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2677             return
2678
2679         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2680         try:
2681             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2682             node_id = media_node.attrib['url']
2683             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2684         except IndexError as err:
2685             self._downloader.report_error(u'Invalid manifest file')
2686             return
2687
2688         url_pr = compat_urllib_parse_urlparse(manifest_url)
2689         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2690
2691         info['url'] = url
2692         info['ext'] = 'f4f'
2693         return [info]
2694
2695
2696 class XVideosIE(InfoExtractor):
2697     """Information extractor for xvideos.com"""
2698
2699     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2700     IE_NAME = u'xvideos'
2701
2702     def report_extraction(self, video_id):
2703         """Report information extraction."""
2704         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2705
2706     def _real_extract(self, url):
2707         mobj = re.match(self._VALID_URL, url)
2708         if mobj is None:
2709             self._downloader.report_error(u'invalid URL: %s' % url)
2710             return
2711         video_id = mobj.group(1)
2712
2713         webpage = self._download_webpage(url, video_id)
2714
2715         self.report_extraction(video_id)
2716
2717
2718         # Extract video URL
2719         mobj = re.search(r'flv_url=(.+?)&', webpage)
2720         if mobj is None:
2721             self._downloader.report_error(u'unable to extract video url')
2722             return
2723         video_url = compat_urllib_parse.unquote(mobj.group(1))
2724
2725
2726         # Extract title
2727         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2728         if mobj is None:
2729             self._downloader.report_error(u'unable to extract video title')
2730             return
2731         video_title = mobj.group(1)
2732
2733
2734         # Extract video thumbnail
2735         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2736         if mobj is None:
2737             self._downloader.report_error(u'unable to extract video thumbnail')
2738             return
2739         video_thumbnail = mobj.group(0)
2740
2741         info = {
2742             'id': video_id,
2743             'url': video_url,
2744             'uploader': None,
2745             'upload_date': None,
2746             'title': video_title,
2747             'ext': 'flv',
2748             'thumbnail': video_thumbnail,
2749             'description': None,
2750         }
2751
2752         return [info]
2753
2754
2755 class SoundcloudIE(InfoExtractor):
2756     """Information extractor for soundcloud.com
2757        To access the media, the uid of the song and a stream token
2758        must be extracted from the page source and the script must make
2759        a request to media.soundcloud.com/crossdomain.xml. Then
2760        the media can be grabbed by requesting from an url composed
2761        of the stream token and uid
2762      """
2763
2764     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2765     IE_NAME = u'soundcloud'
2766
2767     def __init__(self, downloader=None):
2768         InfoExtractor.__init__(self, downloader)
2769
2770     def report_resolve(self, video_id):
2771         """Report information extraction."""
2772         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2773
2774     def report_extraction(self, video_id):
2775         """Report information extraction."""
2776         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2777
2778     def _real_extract(self, url):
2779         mobj = re.match(self._VALID_URL, url)
2780         if mobj is None:
2781             self._downloader.report_error(u'invalid URL: %s' % url)
2782             return
2783
2784         # extract uploader (which is in the url)
2785         uploader = mobj.group(1)
2786         # extract simple title (uploader + slug of song title)
2787         slug_title =  mobj.group(2)
2788         simple_title = uploader + u'-' + slug_title
2789
2790         self.report_resolve('%s/%s' % (uploader, slug_title))
2791
2792         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2793         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2794         request = compat_urllib_request.Request(resolv_url)
2795         try:
2796             info_json_bytes = compat_urllib_request.urlopen(request).read()
2797             info_json = info_json_bytes.decode('utf-8')
2798         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2799             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2800             return
2801
2802         info = json.loads(info_json)
2803         video_id = info['id']
2804         self.report_extraction('%s/%s' % (uploader, slug_title))
2805
2806         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2807         request = compat_urllib_request.Request(streams_url)
2808         try:
2809             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2810             stream_json = stream_json_bytes.decode('utf-8')
2811         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2812             self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2813             return
2814
2815         streams = json.loads(stream_json)
2816         mediaURL = streams['http_mp3_128_url']
2817
2818         return [{
2819             'id':       info['id'],
2820             'url':      mediaURL,
2821             'uploader': info['user']['username'],
2822             'upload_date':  info['created_at'],
2823             'title':    info['title'],
2824             'ext':      u'mp3',
2825             'description': info['description'],
2826         }]
2827
2828 class SoundcloudSetIE(InfoExtractor):
2829     """Information extractor for soundcloud.com sets
2830        To access the media, the uid of the song and a stream token
2831        must be extracted from the page source and the script must make
2832        a request to media.soundcloud.com/crossdomain.xml. Then
2833        the media can be grabbed by requesting from an url composed
2834        of the stream token and uid
2835      """
2836
2837     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2838     IE_NAME = u'soundcloud'
2839
2840     def __init__(self, downloader=None):
2841         InfoExtractor.__init__(self, downloader)
2842
2843     def report_resolve(self, video_id):
2844         """Report information extraction."""
2845         self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2846
2847     def report_extraction(self, video_id):
2848         """Report information extraction."""
2849         self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2850
2851     def _real_extract(self, url):
2852         mobj = re.match(self._VALID_URL, url)
2853         if mobj is None:
2854             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2855             return
2856
2857         # extract uploader (which is in the url)
2858         uploader = mobj.group(1)
2859         # extract simple title (uploader + slug of song title)
2860         slug_title =  mobj.group(2)
2861         simple_title = uploader + u'-' + slug_title
2862
2863         self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2864
2865         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2866         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2867         request = compat_urllib_request.Request(resolv_url)
2868         try:
2869             info_json_bytes = compat_urllib_request.urlopen(request).read()
2870             info_json = info_json_bytes.decode('utf-8')
2871         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2872             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2873             return
2874
2875         videos = []
2876         info = json.loads(info_json)
2877         if 'errors' in info:
2878             for err in info['errors']:
2879                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2880             return
2881
2882         for track in info['tracks']:
2883             video_id = track['id']
2884             self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2885
2886             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2887             request = compat_urllib_request.Request(streams_url)
2888             try:
2889                 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2890                 stream_json = stream_json_bytes.decode('utf-8')
2891             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2892                 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2893                 return
2894
2895             streams = json.loads(stream_json)
2896             mediaURL = streams['http_mp3_128_url']
2897
2898             videos.append({
2899                 'id':       video_id,
2900                 'url':      mediaURL,
2901                 'uploader': track['user']['username'],
2902                 'upload_date':  track['created_at'],
2903                 'title':    track['title'],
2904                 'ext':      u'mp3',
2905                 'description': track['description'],
2906             })
2907         return videos
2908
2909
2910 class InfoQIE(InfoExtractor):
2911     """Information extractor for infoq.com"""
2912     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2913
2914     def report_extraction(self, video_id):
2915         """Report information extraction."""
2916         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2917
2918     def _real_extract(self, url):
2919         mobj = re.match(self._VALID_URL, url)
2920         if mobj is None:
2921             self._downloader.report_error(u'invalid URL: %s' % url)
2922             return
2923
2924         webpage = self._download_webpage(url, video_id=url)
2925         self.report_extraction(url)
2926
2927         # Extract video URL
2928         mobj = re.search(r"jsclassref='([^']*)'", webpage)
2929         if mobj is None:
2930             self._downloader.report_error(u'unable to extract video url')
2931             return
2932         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2933         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2934
2935         # Extract title
2936         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2937         if mobj is None:
2938             self._downloader.report_error(u'unable to extract video title')
2939             return
2940         video_title = mobj.group(1)
2941
2942         # Extract description
2943         video_description = u'No description available.'
2944         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2945         if mobj is not None:
2946             video_description = mobj.group(1)
2947
2948         video_filename = video_url.split('/')[-1]
2949         video_id, extension = video_filename.split('.')
2950
2951         info = {
2952             'id': video_id,
2953             'url': video_url,
2954             'uploader': None,
2955             'upload_date': None,
2956             'title': video_title,
2957             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2958             'thumbnail': None,
2959             'description': video_description,
2960         }
2961
2962         return [info]
2963
2964 class MixcloudIE(InfoExtractor):
2965     """Information extractor for www.mixcloud.com"""
2966
2967     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2968     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2969     IE_NAME = u'mixcloud'
2970
2971     def __init__(self, downloader=None):
2972         InfoExtractor.__init__(self, downloader)
2973
2974     def report_download_json(self, file_id):
2975         """Report JSON download."""
2976         self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2977
2978     def report_extraction(self, file_id):
2979         """Report information extraction."""
2980         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2981
2982     def get_urls(self, jsonData, fmt, bitrate='best'):
2983         """Get urls from 'audio_formats' section in json"""
2984         file_url = None
2985         try:
2986             bitrate_list = jsonData[fmt]
2987             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2988                 bitrate = max(bitrate_list) # select highest
2989
2990             url_list = jsonData[fmt][bitrate]
2991         except TypeError: # we have no bitrate info.
2992             url_list = jsonData[fmt]
2993         return url_list
2994
2995     def check_urls(self, url_list):
2996         """Returns 1st active url from list"""
2997         for url in url_list:
2998             try:
2999                 compat_urllib_request.urlopen(url)
3000                 return url
3001             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3002                 url = None
3003
3004         return None
3005
3006     def _print_formats(self, formats):
3007         print('Available formats:')
3008         for fmt in formats.keys():
3009             for b in formats[fmt]:
3010                 try:
3011                     ext = formats[fmt][b][0]
3012                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3013                 except TypeError: # we have no bitrate info
3014                     ext = formats[fmt][0]
3015                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3016                     break
3017
3018     def _real_extract(self, url):
3019         mobj = re.match(self._VALID_URL, url)
3020         if mobj is None:
3021             self._downloader.report_error(u'invalid URL: %s' % url)
3022             return
3023         # extract uploader & filename from url
3024         uploader = mobj.group(1).decode('utf-8')
3025         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3026
3027         # construct API request
3028         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3029         # retrieve .json file with links to files
3030         request = compat_urllib_request.Request(file_url)
3031         try:
3032             self.report_download_json(file_url)
3033             jsonData = compat_urllib_request.urlopen(request).read()
3034         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3035             self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3036             return
3037
3038         # parse JSON
3039         json_data = json.loads(jsonData)
3040         player_url = json_data['player_swf_url']
3041         formats = dict(json_data['audio_formats'])
3042
3043         req_format = self._downloader.params.get('format', None)
3044         bitrate = None
3045
3046         if self._downloader.params.get('listformats', None):
3047             self._print_formats(formats)
3048             return
3049
3050         if req_format is None or req_format == 'best':
3051             for format_param in formats.keys():
3052                 url_list = self.get_urls(formats, format_param)
3053                 # check urls
3054                 file_url = self.check_urls(url_list)
3055                 if file_url is not None:
3056                     break # got it!
3057         else:
3058             if req_format not in formats:
3059                 self._downloader.report_error(u'format is not available')
3060                 return
3061
3062             url_list = self.get_urls(formats, req_format)
3063             file_url = self.check_urls(url_list)
3064             format_param = req_format
3065
3066         return [{
3067             'id': file_id.decode('utf-8'),
3068             'url': file_url.decode('utf-8'),
3069             'uploader': uploader.decode('utf-8'),
3070             'upload_date': None,
3071             'title': json_data['name'],
3072             'ext': file_url.split('.')[-1].decode('utf-8'),
3073             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3074             'thumbnail': json_data['thumbnail_url'],
3075             'description': json_data['description'],
3076             'player_url': player_url.decode('utf-8'),
3077         }]
3078
3079 class StanfordOpenClassroomIE(InfoExtractor):
3080     """Information extractor for Stanford's Open ClassRoom"""
3081
3082     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3083     IE_NAME = u'stanfordoc'
3084
3085     def report_download_webpage(self, objid):
3086         """Report information extraction."""
3087         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3088
3089     def report_extraction(self, video_id):
3090         """Report information extraction."""
3091         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3092
3093     def _real_extract(self, url):
3094         mobj = re.match(self._VALID_URL, url)
3095         if mobj is None:
3096             raise ExtractorError(u'Invalid URL: %s' % url)
3097
3098         if mobj.group('course') and mobj.group('video'): # A specific video
3099             course = mobj.group('course')
3100             video = mobj.group('video')
3101             info = {
3102                 'id': course + '_' + video,
3103                 'uploader': None,
3104                 'upload_date': None,
3105             }
3106
3107             self.report_extraction(info['id'])
3108             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3109             xmlUrl = baseUrl + video + '.xml'
3110             try:
3111                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3112             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3113                 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3114                 return
3115             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3116             try:
3117                 info['title'] = mdoc.findall('./title')[0].text
3118                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3119             except IndexError:
3120                 self._downloader.report_error(u'Invalid metadata XML file')
3121                 return
3122             info['ext'] = info['url'].rpartition('.')[2]
3123             return [info]
3124         elif mobj.group('course'): # A course page
3125             course = mobj.group('course')
3126             info = {
3127                 'id': course,
3128                 'type': 'playlist',
3129                 'uploader': None,
3130                 'upload_date': None,
3131             }
3132
3133             coursepage = self._download_webpage(url, info['id'],
3134                                         note='Downloading course info page',
3135                                         errnote='Unable to download course info page')
3136
3137             m = re.search('<h1>([^<]+)</h1>', coursepage)
3138             if m:
3139                 info['title'] = unescapeHTML(m.group(1))
3140             else:
3141                 info['title'] = info['id']
3142
3143             m = re.search('<description>([^<]+)</description>', coursepage)
3144             if m:
3145                 info['description'] = unescapeHTML(m.group(1))
3146
3147             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3148             info['list'] = [
3149                 {
3150                     'type': 'reference',
3151                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3152                 }
3153                     for vpage in links]
3154             results = []
3155             for entry in info['list']:
3156                 assert entry['type'] == 'reference'
3157                 results += self.extract(entry['url'])
3158             return results
3159         else: # Root page
3160             info = {
3161                 'id': 'Stanford OpenClassroom',
3162                 'type': 'playlist',
3163                 'uploader': None,
3164                 'upload_date': None,
3165             }
3166
3167             self.report_download_webpage(info['id'])
3168             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3169             try:
3170                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3171             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3172                 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3173                 return
3174
3175             info['title'] = info['id']
3176
3177             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3178             info['list'] = [
3179                 {
3180                     'type': 'reference',
3181                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3182                 }
3183                     for cpage in links]
3184
3185             results = []
3186             for entry in info['list']:
3187                 assert entry['type'] == 'reference'
3188                 results += self.extract(entry['url'])
3189             return results
3190
3191 class MTVIE(InfoExtractor):
3192     """Information extractor for MTV.com"""
3193
3194     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3195     IE_NAME = u'mtv'
3196
3197     def report_extraction(self, video_id):
3198         """Report information extraction."""
3199         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3200
3201     def _real_extract(self, url):
3202         mobj = re.match(self._VALID_URL, url)
3203         if mobj is None:
3204             self._downloader.report_error(u'invalid URL: %s' % url)
3205             return
3206         if not mobj.group('proto'):
3207             url = 'http://' + url
3208         video_id = mobj.group('videoid')
3209
3210         webpage = self._download_webpage(url, video_id)
3211
3212         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3213         if mobj is None:
3214             self._downloader.report_error(u'unable to extract song name')
3215             return
3216         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3217         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3218         if mobj is None:
3219             self._downloader.report_error(u'unable to extract performer')
3220             return
3221         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3222         video_title = performer + ' - ' + song_name
3223
3224         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3225         if mobj is None:
3226             self._downloader.report_error(u'unable to mtvn_uri')
3227             return
3228         mtvn_uri = mobj.group(1)
3229
3230         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3231         if mobj is None:
3232             self._downloader.report_error(u'unable to extract content id')
3233             return
3234         content_id = mobj.group(1)
3235
3236         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3237         self.report_extraction(video_id)
3238         request = compat_urllib_request.Request(videogen_url)
3239         try:
3240             metadataXml = compat_urllib_request.urlopen(request).read()
3241         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3242             self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3243             return
3244
3245         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3246         renditions = mdoc.findall('.//rendition')
3247
3248         # For now, always pick the highest quality.
3249         rendition = renditions[-1]
3250
3251         try:
3252             _,_,ext = rendition.attrib['type'].partition('/')
3253             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3254             video_url = rendition.find('./src').text
3255         except KeyError:
3256             self._downloader.trouble('Invalid rendition field.')
3257             return
3258
3259         info = {
3260             'id': video_id,
3261             'url': video_url,
3262             'uploader': performer,
3263             'upload_date': None,
3264             'title': video_title,
3265             'ext': ext,
3266             'format': format,
3267         }
3268
3269         return [info]
3270
3271
3272 class YoukuIE(InfoExtractor):
3273     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3274
3275     def report_download_webpage(self, file_id):
3276         """Report webpage download."""
3277         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3278
3279     def report_extraction(self, file_id):
3280         """Report information extraction."""
3281         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3282
3283     def _gen_sid(self):
3284         nowTime = int(time.time() * 1000)
3285         random1 = random.randint(1000,1998)
3286         random2 = random.randint(1000,9999)
3287
3288         return "%d%d%d" %(nowTime,random1,random2)
3289
3290     def _get_file_ID_mix_string(self, seed):
3291         mixed = []
3292         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3293         seed = float(seed)
3294         for i in range(len(source)):
3295             seed  =  (seed * 211 + 30031 ) % 65536
3296             index  =  math.floor(seed / 65536 * len(source) )
3297             mixed.append(source[int(index)])
3298             source.remove(source[int(index)])
3299         #return ''.join(mixed)
3300         return mixed
3301
3302     def _get_file_id(self, fileId, seed):
3303         mixed = self._get_file_ID_mix_string(seed)
3304         ids = fileId.split('*')
3305         realId = []
3306         for ch in ids:
3307             if ch:
3308                 realId.append(mixed[int(ch)])
3309         return ''.join(realId)
3310
3311     def _real_extract(self, url):
3312         mobj = re.match(self._VALID_URL, url)
3313         if mobj is None:
3314             self._downloader.report_error(u'invalid URL: %s' % url)
3315             return
3316         video_id = mobj.group('ID')
3317
3318         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3319
3320         request = compat_urllib_request.Request(info_url, None, std_headers)
3321         try:
3322             self.report_download_webpage(video_id)
3323             jsondata = compat_urllib_request.urlopen(request).read()
3324         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3325             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3326             return
3327
3328         self.report_extraction(video_id)
3329         try:
3330             jsonstr = jsondata.decode('utf-8')
3331             config = json.loads(jsonstr)
3332
3333             video_title =  config['data'][0]['title']
3334             seed = config['data'][0]['seed']
3335
3336             format = self._downloader.params.get('format', None)
3337             supported_format = list(config['data'][0]['streamfileids'].keys())
3338
3339             if format is None or format == 'best':
3340                 if 'hd2' in supported_format:
3341                     format = 'hd2'
3342                 else:
3343                     format = 'flv'
3344                 ext = u'flv'
3345             elif format == 'worst':
3346                 format = 'mp4'
3347                 ext = u'mp4'
3348             else:
3349                 format = 'flv'
3350                 ext = u'flv'
3351
3352
3353             fileid = config['data'][0]['streamfileids'][format]
3354             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3355         except (UnicodeDecodeError, ValueError, KeyError):
3356             self._downloader.report_error(u'unable to extract info section')
3357             return
3358
3359         files_info=[]
3360         sid = self._gen_sid()
3361         fileid = self._get_file_id(fileid, seed)
3362
3363         #column 8,9 of fileid represent the segment number
3364         #fileid[7:9] should be changed
3365         for index, key in enumerate(keys):
3366
3367             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3368             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3369
3370             info = {
3371                 'id': '%s_part%02d' % (video_id, index),
3372                 'url': download_url,
3373                 'uploader': None,
3374                 'upload_date': None,
3375                 'title': video_title,
3376                 'ext': ext,
3377             }
3378             files_info.append(info)
3379
3380         return files_info
3381
3382
3383 class XNXXIE(InfoExtractor):
3384     """Information extractor for xnxx.com"""
3385
3386     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3387     IE_NAME = u'xnxx'
3388     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3389     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3390     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3391
3392     def report_webpage(self, video_id):
3393         """Report information extraction"""
3394         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3395
3396     def report_extraction(self, video_id):
3397         """Report information extraction"""
3398         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3399
3400     def _real_extract(self, url):
3401         mobj = re.match(self._VALID_URL, url)
3402         if mobj is None:
3403             self._downloader.report_error(u'invalid URL: %s' % url)
3404             return
3405         video_id = mobj.group(1)
3406
3407         self.report_webpage(video_id)
3408
3409         # Get webpage content
3410         try:
3411             webpage_bytes = compat_urllib_request.urlopen(url).read()
3412             webpage = webpage_bytes.decode('utf-8')
3413         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3414             self._downloader.report_error(u'unable to download video webpage: %s' % err)
3415             return
3416
3417         result = re.search(self.VIDEO_URL_RE, webpage)
3418         if result is None:
3419             self._downloader.report_error(u'unable to extract video url')
3420             return
3421         video_url = compat_urllib_parse.unquote(result.group(1))
3422
3423         result = re.search(self.VIDEO_TITLE_RE, webpage)
3424         if result is None:
3425             self._downloader.report_error(u'unable to extract video title')
3426             return
3427         video_title = result.group(1)
3428
3429         result = re.search(self.VIDEO_THUMB_RE, webpage)
3430         if result is None:
3431             self._downloader.report_error(u'unable to extract video thumbnail')
3432             return
3433         video_thumbnail = result.group(1)
3434
3435         return [{
3436             'id': video_id,
3437             'url': video_url,
3438             'uploader': None,
3439             'upload_date': None,
3440             'title': video_title,
3441             'ext': 'flv',
3442             'thumbnail': video_thumbnail,
3443             'description': None,
3444         }]
3445
3446
3447 class GooglePlusIE(InfoExtractor):
3448     """Information extractor for plus.google.com."""
3449
3450     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3451     IE_NAME = u'plus.google'
3452
3453     def __init__(self, downloader=None):
3454         InfoExtractor.__init__(self, downloader)
3455
3456     def report_extract_entry(self, url):
3457         """Report downloading extry"""
3458         self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3459
3460     def report_date(self, upload_date):
3461         """Report downloading extry"""
3462         self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3463
3464     def report_uploader(self, uploader):
3465         """Report downloading extry"""
3466         self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3467
3468     def report_title(self, video_title):
3469         """Report downloading extry"""
3470         self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3471
3472     def report_extract_vid_page(self, video_page):
3473         """Report information extraction."""
3474         self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3475
3476     def _real_extract(self, url):
3477         # Extract id from URL
3478         mobj = re.match(self._VALID_URL, url)
3479         if mobj is None:
3480             self._downloader.report_error(u'Invalid URL: %s' % url)
3481             return
3482
3483         post_url = mobj.group(0)
3484         video_id = mobj.group(1)
3485
3486         video_extension = 'flv'
3487
3488         # Step 1, Retrieve post webpage to extract further information
3489         self.report_extract_entry(post_url)
3490         request = compat_urllib_request.Request(post_url)
3491         try:
3492             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3493         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3494             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3495             return
3496
3497         # Extract update date
3498         upload_date = None
3499         pattern = 'title="Timestamp">(.*?)</a>'
3500         mobj = re.search(pattern, webpage)
3501         if mobj:
3502             upload_date = mobj.group(1)
3503             # Convert timestring to a format suitable for filename
3504             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3505             upload_date = upload_date.strftime('%Y%m%d')
3506         self.report_date(upload_date)
3507
3508         # Extract uploader
3509         uploader = None
3510         pattern = r'rel\="author".*?>(.*?)</a>'
3511         mobj = re.search(pattern, webpage)
3512         if mobj:
3513             uploader = mobj.group(1)
3514         self.report_uploader(uploader)
3515
3516         # Extract title
3517         # Get the first line for title
3518         video_title = u'NA'
3519         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3520         mobj = re.search(pattern, webpage)
3521         if mobj:
3522             video_title = mobj.group(1)
3523         self.report_title(video_title)
3524
3525         # Step 2, Stimulate clicking the image box to launch video
3526         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3527         mobj = re.search(pattern, webpage)
3528         if mobj is None:
3529             self._downloader.report_error(u'unable to extract video page URL')
3530
3531         video_page = mobj.group(1)
3532         request = compat_urllib_request.Request(video_page)
3533         try:
3534             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3535         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3536             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3537             return
3538         self.report_extract_vid_page(video_page)
3539
3540
3541         # Extract video links on video page
3542         """Extract video links of all sizes"""
3543         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3544         mobj = re.findall(pattern, webpage)
3545         if len(mobj) == 0:
3546             self._downloader.report_error(u'unable to extract video links')
3547
3548         # Sort in resolution
3549         links = sorted(mobj)
3550
3551         # Choose the lowest of the sort, i.e. highest resolution
3552         video_url = links[-1]
3553         # Only get the url. The resolution part in the tuple has no use anymore
3554         video_url = video_url[-1]
3555         # Treat escaped \u0026 style hex
3556         try:
3557             video_url = video_url.decode("unicode_escape")
3558         except AttributeError: # Python 3
3559             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3560
3561
3562         return [{
3563             'id':       video_id,
3564             'url':      video_url,
3565             'uploader': uploader,
3566             'upload_date':  upload_date,
3567             'title':    video_title,
3568             'ext':      video_extension,
3569         }]
3570
3571 class NBAIE(InfoExtractor):
3572     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3573     IE_NAME = u'nba'
3574
3575     def _real_extract(self, url):
3576         mobj = re.match(self._VALID_URL, url)
3577         if mobj is None:
3578             self._downloader.report_error(u'invalid URL: %s' % url)
3579             return
3580
3581         video_id = mobj.group(1)
3582         if video_id.endswith('/index.html'):
3583             video_id = video_id[:-len('/index.html')]
3584
3585         webpage = self._download_webpage(url, video_id)
3586
3587         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3588         def _findProp(rexp, default=None):
3589             m = re.search(rexp, webpage)
3590             if m:
3591                 return unescapeHTML(m.group(1))
3592             else:
3593                 return default
3594
3595         shortened_video_id = video_id.rpartition('/')[2]
3596         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3597         info = {
3598             'id': shortened_video_id,
3599             'url': video_url,
3600             'ext': 'mp4',
3601             'title': title,
3602             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3603             'description': _findProp(r'<div class="description">(.*?)</h1>'),
3604         }
3605         return [info]
3606
3607 class JustinTVIE(InfoExtractor):
3608     """Information extractor for justin.tv and twitch.tv"""
3609     # TODO: One broadcast may be split into multiple videos. The key
3610     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3611     # starts at 1 and increases. Can we treat all parts as one video?
3612
3613     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3614         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3615     _JUSTIN_PAGE_LIMIT = 100
3616     IE_NAME = u'justin.tv'
3617
3618     def report_extraction(self, file_id):
3619         """Report information extraction."""
3620         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3621
3622     def report_download_page(self, channel, offset):
3623         """Report attempt to download a single page of videos."""
3624         self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3625                 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3626
3627     # Return count of items, list of *valid* items
3628     def _parse_page(self, url):
3629         try:
3630             urlh = compat_urllib_request.urlopen(url)
3631             webpage_bytes = urlh.read()
3632             webpage = webpage_bytes.decode('utf-8', 'ignore')
3633         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3634             self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3635             return
3636
3637         response = json.loads(webpage)
3638         if type(response) != list:
3639             error_text = response.get('error', 'unknown error')
3640             self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3641             return
3642         info = []
3643         for clip in response:
3644             video_url = clip['video_file_url']
3645             if video_url:
3646                 video_extension = os.path.splitext(video_url)[1][1:]
3647                 video_date = re.sub('-', '', clip['start_time'][:10])
3648                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3649                 video_id = clip['id']
3650                 video_title = clip.get('title', video_id)
3651                 info.append({
3652                     'id': video_id,
3653                     'url': video_url,
3654                     'title': video_title,
3655                     'uploader': clip.get('channel_name', video_uploader_id),
3656                     'uploader_id': video_uploader_id,
3657                     'upload_date': video_date,
3658                     'ext': video_extension,
3659                 })
3660         return (len(response), info)
3661
3662     def _real_extract(self, url):
3663         mobj = re.match(self._VALID_URL, url)
3664         if mobj is None:
3665             self._downloader.report_error(u'invalid URL: %s' % url)
3666             return
3667
3668         api = 'http://api.justin.tv'
3669         video_id = mobj.group(mobj.lastindex)
3670         paged = False
3671         if mobj.lastindex == 1:
3672             paged = True
3673             api += '/channel/archives/%s.json'
3674         else:
3675             api += '/broadcast/by_archive/%s.json'
3676         api = api % (video_id,)
3677
3678         self.report_extraction(video_id)
3679
3680         info = []
3681         offset = 0
3682         limit = self._JUSTIN_PAGE_LIMIT
3683         while True:
3684             if paged:
3685                 self.report_download_page(video_id, offset)
3686             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3687             page_count, page_info = self._parse_page(page_url)
3688             info.extend(page_info)
3689             if not paged or page_count != limit:
3690                 break
3691             offset += limit
3692         return info
3693
3694 class FunnyOrDieIE(InfoExtractor):
3695     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3696
3697     def _real_extract(self, url):
3698         mobj = re.match(self._VALID_URL, url)
3699         if mobj is None:
3700             self._downloader.report_error(u'invalid URL: %s' % url)
3701             return
3702
3703         video_id = mobj.group('id')
3704         webpage = self._download_webpage(url, video_id)
3705
3706         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3707         if not m:
3708             self._downloader.report_error(u'unable to find video information')
3709         video_url = unescapeHTML(m.group('url'))
3710
3711         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3712         if not m:
3713             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3714             if not m:
3715                 self._downloader.trouble(u'Cannot find video title')
3716         title = clean_html(m.group('title'))
3717
3718         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3719         if m:
3720             desc = unescapeHTML(m.group('desc'))
3721         else:
3722             desc = None
3723
3724         info = {
3725             'id': video_id,
3726             'url': video_url,
3727             'ext': 'mp4',
3728             'title': title,
3729             'description': desc,
3730         }
3731         return [info]
3732
3733 class SteamIE(InfoExtractor):
3734     _VALID_URL = r"""http://store.steampowered.com/
3735                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3736                 (?P<gameID>\d+)/?
3737                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3738                 """
3739
3740     @classmethod
3741     def suitable(cls, url):
3742         """Receives a URL and returns True if suitable for this IE."""
3743         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3744
3745     def _real_extract(self, url):
3746         m = re.match(self._VALID_URL, url, re.VERBOSE)
3747         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3748         gameID = m.group('gameID')
3749         videourl = 'http://store.steampowered.com/video/%s/' % gameID
3750         webpage = self._download_webpage(videourl, gameID)
3751         mweb = re.finditer(urlRE, webpage)
3752         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3753         titles = re.finditer(namesRE, webpage)
3754         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3755         thumbs = re.finditer(thumbsRE, webpage)
3756         videos = []
3757         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3758             video_id = vid.group('videoID')
3759             title = vtitle.group('videoName')
3760             video_url = vid.group('videoURL')
3761             video_thumb = thumb.group('thumbnail')
3762             if not video_url:
3763                 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3764             info = {
3765                 'id':video_id,
3766                 'url':video_url,
3767                 'ext': 'flv',
3768                 'title': unescapeHTML(title),
3769                 'thumbnail': video_thumb
3770                   }
3771             videos.append(info)
3772         return videos
3773
3774 class UstreamIE(InfoExtractor):
3775     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3776     IE_NAME = u'ustream'
3777
3778     def _real_extract(self, url):
3779         m = re.match(self._VALID_URL, url)
3780         video_id = m.group('videoID')
3781         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3782         webpage = self._download_webpage(url, video_id)
3783         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3784         title = m.group('title')
3785         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3786         uploader = m.group('uploader')
3787         info = {
3788                 'id':video_id,
3789                 'url':video_url,
3790                 'ext': 'flv',
3791                 'title': title,
3792                 'uploader': uploader
3793                   }
3794         return [info]
3795
3796 class WorldStarHipHopIE(InfoExtractor):
3797     _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3798     IE_NAME = u'WorldStarHipHop'
3799
3800     def _real_extract(self, url):
3801         _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3802
3803         webpage_src = compat_urllib_request.urlopen(url).read()
3804         webpage_src = webpage_src.decode('utf-8')
3805
3806         mobj = re.search(_src_url, webpage_src)
3807
3808         m = re.match(self._VALID_URL, url)
3809         video_id = m.group('id')
3810
3811         if mobj is not None:
3812             video_url = mobj.group()
3813             if 'mp4' in video_url:
3814                 ext = 'mp4'
3815             else:
3816                 ext = 'flv'
3817         else:
3818             self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3819             return
3820
3821         _title = r"""<title>(.*)</title>"""
3822
3823         mobj = re.search(_title, webpage_src)
3824
3825         if mobj is not None:
3826             title = mobj.group(1)
3827         else:
3828             title = 'World Start Hip Hop - %s' % time.ctime()
3829
3830         _thumbnail = r"""rel="image_src" href="(.*)" />"""
3831         mobj = re.search(_thumbnail, webpage_src)
3832
3833         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3834         if mobj is not None:
3835             thumbnail = mobj.group(1)
3836         else:
3837             _title = r"""candytitles.*>(.*)</span>"""
3838             mobj = re.search(_title, webpage_src)
3839             if mobj is not None:
3840                 title = mobj.group(1)
3841             thumbnail = None
3842
3843         results = [{
3844                     'id': video_id,
3845                     'url' : video_url,
3846                     'title' : title,
3847                     'thumbnail' : thumbnail,
3848                     'ext' : ext,
3849                     }]
3850         return results
3851
3852 class RBMARadioIE(InfoExtractor):
3853     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3854
3855     def _real_extract(self, url):
3856         m = re.match(self._VALID_URL, url)
3857         video_id = m.group('videoID')
3858
3859         webpage = self._download_webpage(url, video_id)
3860         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3861         if not m:
3862             raise ExtractorError(u'Cannot find metadata')
3863         json_data = m.group(1)
3864
3865         try:
3866             data = json.loads(json_data)
3867         except ValueError as e:
3868             raise ExtractorError(u'Invalid JSON: ' + str(e))
3869
3870         video_url = data['akamai_url'] + '&cbr=256'
3871         url_parts = compat_urllib_parse_urlparse(video_url)
3872         video_ext = url_parts.path.rpartition('.')[2]
3873         info = {
3874                 'id': video_id,
3875                 'url': video_url,
3876                 'ext': video_ext,
3877                 'title': data['title'],
3878                 'description': data.get('teaser_text'),
3879                 'location': data.get('country_of_origin'),
3880                 'uploader': data.get('host', {}).get('name'),
3881                 'uploader_id': data.get('host', {}).get('slug'),
3882                 'thumbnail': data.get('image', {}).get('large_url_2x'),
3883                 'duration': data.get('duration'),
3884         }
3885         return [info]
3886
3887
3888 class YouPornIE(InfoExtractor):
3889     """Information extractor for youporn.com."""
3890     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3891
3892     def _print_formats(self, formats):
3893         """Print all available formats"""
3894         print(u'Available formats:')
3895         print(u'ext\t\tformat')
3896         print(u'---------------------------------')
3897         for format in formats:
3898             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3899
3900     def _specific(self, req_format, formats):
3901         for x in formats:
3902             if(x["format"]==req_format):
3903                 return x
3904         return None
3905
3906     def _real_extract(self, url):
3907         mobj = re.match(self._VALID_URL, url)
3908         if mobj is None:
3909             self._downloader.report_error(u'invalid URL: %s' % url)
3910             return
3911
3912         video_id = mobj.group('videoid')
3913
3914         req = compat_urllib_request.Request(url)
3915         req.add_header('Cookie', 'age_verified=1')
3916         webpage = self._download_webpage(req, video_id)
3917
3918         # Get the video title
3919         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3920         if result is None:
3921             raise ExtractorError(u'Unable to extract video title')
3922         video_title = result.group('title').strip()
3923
3924         # Get the video date
3925         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3926         if result is None:
3927             self._downloader.report_warning(u'unable to extract video date')
3928             upload_date = None
3929         else:
3930             upload_date = result.group('date').strip()
3931
3932         # Get the video uploader
3933         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3934         if result is None:
3935             self._downloader.report_warning(u'unable to extract uploader')
3936             video_uploader = None
3937         else:
3938             video_uploader = result.group('uploader').strip()
3939             video_uploader = clean_html( video_uploader )
3940
3941         # Get all of the formats available
3942         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3943         result = re.search(DOWNLOAD_LIST_RE, webpage)
3944         if result is None:
3945             raise ExtractorError(u'Unable to extract download list')
3946         download_list_html = result.group('download_list').strip()
3947
3948         # Get all of the links from the page
3949         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3950         links = re.findall(LINK_RE, download_list_html)
3951         if(len(links) == 0):
3952             raise ExtractorError(u'ERROR: no known formats available for video')
3953
3954         self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3955
3956         formats = []
3957         for link in links:
3958
3959             # A link looks like this:
3960             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3961             # A path looks like this:
3962             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3963             video_url = unescapeHTML( link )
3964             path = compat_urllib_parse_urlparse( video_url ).path
3965             extension = os.path.splitext( path )[1][1:]
3966             format = path.split('/')[4].split('_')[:2]
3967             size = format[0]
3968             bitrate = format[1]
3969             format = "-".join( format )
3970             title = u'%s-%s-%s' % (video_title, size, bitrate)
3971
3972             formats.append({
3973                 'id': video_id,
3974                 'url': video_url,
3975                 'uploader': video_uploader,
3976                 'upload_date': upload_date,
3977                 'title': title,
3978                 'ext': extension,
3979                 'format': format,
3980                 'thumbnail': None,
3981                 'description': None,
3982                 'player_url': None
3983             })
3984
3985         if self._downloader.params.get('listformats', None):
3986             self._print_formats(formats)
3987             return
3988
3989         req_format = self._downloader.params.get('format', None)
3990         self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3991
3992         if req_format is None or req_format == 'best':
3993             return [formats[0]]
3994         elif req_format == 'worst':
3995             return [formats[-1]]
3996         elif req_format in ('-1', 'all'):
3997             return formats
3998         else:
3999             format = self._specific( req_format, formats )
4000             if result is None:
4001                 self._downloader.report_error(u'requested format not available')
4002                 return
4003             return [format]
4004
4005
4006
4007 class PornotubeIE(InfoExtractor):
4008     """Information extractor for pornotube.com."""
4009     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
4010
4011     def _real_extract(self, url):
4012         mobj = re.match(self._VALID_URL, url)
4013         if mobj is None:
4014             self._downloader.report_error(u'invalid URL: %s' % url)
4015             return
4016
4017         video_id = mobj.group('videoid')
4018         video_title = mobj.group('title')
4019
4020         # Get webpage content
4021         webpage = self._download_webpage(url, video_id)
4022
4023         # Get the video URL
4024         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
4025         result = re.search(VIDEO_URL_RE, webpage)
4026         if result is None:
4027             self._downloader.report_error(u'unable to extract video url')
4028             return
4029         video_url = compat_urllib_parse.unquote(result.group('url'))
4030
4031         #Get the uploaded date
4032         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
4033         result = re.search(VIDEO_UPLOADED_RE, webpage)
4034         if result is None:
4035             self._downloader.report_error(u'unable to extract video title')
4036             return
4037         upload_date = result.group('date')
4038
4039         info = {'id': video_id,
4040                 'url': video_url,
4041                 'uploader': None,
4042                 'upload_date': upload_date,
4043                 'title': video_title,
4044                 'ext': 'flv',
4045                 'format': 'flv'}
4046
4047         return [info]
4048
4049 class YouJizzIE(InfoExtractor):
4050     """Information extractor for youjizz.com."""
4051     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4052
4053     def _real_extract(self, url):
4054         mobj = re.match(self._VALID_URL, url)
4055         if mobj is None:
4056             self._downloader.report_error(u'invalid URL: %s' % url)
4057             return
4058
4059         video_id = mobj.group('videoid')
4060
4061         # Get webpage content
4062         webpage = self._download_webpage(url, video_id)
4063
4064         # Get the video title
4065         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4066         if result is None:
4067             raise ExtractorError(u'ERROR: unable to extract video title')
4068         video_title = result.group('title').strip()
4069
4070         # Get the embed page
4071         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4072         if result is None:
4073             raise ExtractorError(u'ERROR: unable to extract embed page')
4074
4075         embed_page_url = result.group(0).strip()
4076         video_id = result.group('videoid')
4077
4078         webpage = self._download_webpage(embed_page_url, video_id)
4079
4080         # Get the video URL
4081         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4082         if result is None:
4083             raise ExtractorError(u'ERROR: unable to extract video url')
4084         video_url = result.group('source')
4085
4086         info = {'id': video_id,
4087                 'url': video_url,
4088                 'title': video_title,
4089                 'ext': 'flv',
4090                 'format': 'flv',
4091                 'player_url': embed_page_url}
4092
4093         return [info]
4094
4095 class EightTracksIE(InfoExtractor):
4096     IE_NAME = '8tracks'
4097     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4098
4099     def _real_extract(self, url):
4100         mobj = re.match(self._VALID_URL, url)
4101         if mobj is None:
4102             raise ExtractorError(u'Invalid URL: %s' % url)
4103         playlist_id = mobj.group('id')
4104
4105         webpage = self._download_webpage(url, playlist_id)
4106
4107         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4108         if not m:
4109             raise ExtractorError(u'Cannot find trax information')
4110         json_like = m.group(1)
4111         data = json.loads(json_like)
4112
4113         session = str(random.randint(0, 1000000000))
4114         mix_id = data['id']
4115         track_count = data['tracks_count']
4116         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4117         next_url = first_url
4118         res = []
4119         for i in itertools.count():
4120             api_json = self._download_webpage(next_url, playlist_id,
4121                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4122                 errnote=u'Failed to download song information')
4123             api_data = json.loads(api_json)
4124             track_data = api_data[u'set']['track']
4125             info = {
4126                 'id': track_data['id'],
4127                 'url': track_data['track_file_stream_url'],
4128                 'title': track_data['performer'] + u' - ' + track_data['name'],
4129                 'raw_title': track_data['name'],
4130                 'uploader_id': data['user']['login'],
4131                 'ext': 'm4a',
4132             }
4133             res.append(info)
4134             if api_data['set']['at_last_track']:
4135                 break
4136             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
4137         return res
4138
4139 class KeekIE(InfoExtractor):
4140     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4141     IE_NAME = u'keek'
4142
4143     def _real_extract(self, url):
4144         m = re.match(self._VALID_URL, url)
4145         video_id = m.group('videoID')
4146         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4147         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4148         webpage = self._download_webpage(url, video_id)
4149         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4150         title = unescapeHTML(m.group('title'))
4151         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4152         uploader = clean_html(m.group('uploader'))
4153         info = {
4154                 'id': video_id,
4155                 'url': video_url,
4156                 'ext': 'mp4',
4157                 'title': title,
4158                 'thumbnail': thumbnail,
4159                 'uploader': uploader
4160         }
4161         return [info]
4162
4163 class TEDIE(InfoExtractor):
4164     _VALID_URL=r'''http://www.ted.com/
4165                    (
4166                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4167                         |
4168                         ((?P<type_talk>talks)) # We have a simple talk
4169                    )
4170                    /(?P<name>\w+) # Here goes the name and then ".html"
4171                    '''
4172
4173     @classmethod
4174     def suitable(cls, url):
4175         """Receives a URL and returns True if suitable for this IE."""
4176         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4177
4178     def _real_extract(self, url):
4179         m=re.match(self._VALID_URL, url, re.VERBOSE)
4180         if m.group('type_talk'):
4181             return [self._talk_info(url)]
4182         else :
4183             playlist_id=m.group('playlist_id')
4184             name=m.group('name')
4185             self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4186             return [self._playlist_videos_info(url,name,playlist_id)]
4187
4188     def _talk_video_link(self,mediaSlug):
4189         '''Returns the video link for that mediaSlug'''
4190         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4191
4192     def _playlist_videos_info(self,url,name,playlist_id=0):
4193         '''Returns the videos of the playlist'''
4194         video_RE=r'''
4195                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4196                      ([.\s]*?)data-playlist_item_id="(\d+)"
4197                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4198                      '''
4199         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4200         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4201         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4202         m_names=re.finditer(video_name_RE,webpage)
4203
4204         playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4205         m_playlist = re.search(playlist_RE, webpage)
4206         playlist_title = m_playlist.group('playlist_title')
4207
4208         playlist_entries = []
4209         for m_video, m_name in zip(m_videos,m_names):
4210             video_id=m_video.group('video_id')
4211             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4212             playlist_entries.append(self.url_result(talk_url, 'TED'))
4213         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4214
4215     def _talk_info(self, url, video_id=0):
4216         """Return the video for the talk in the url"""
4217         m=re.match(self._VALID_URL, url,re.VERBOSE)
4218         videoName=m.group('name')
4219         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4220         # If the url includes the language we get the title translated
4221         title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4222         title=re.search(title_RE, webpage).group('title')
4223         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4224                         "id":(?P<videoID>[\d]+).*?
4225                         "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4226         thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4227         thumb_match=re.search(thumb_RE,webpage)
4228         info_match=re.search(info_RE,webpage,re.VERBOSE)
4229         video_id=info_match.group('videoID')
4230         mediaSlug=info_match.group('mediaSlug')
4231         video_url=self._talk_video_link(mediaSlug)
4232         info = {
4233                 'id': video_id,
4234                 'url': video_url,
4235                 'ext': 'mp4',
4236                 'title': title,
4237                 'thumbnail': thumb_match.group('thumbnail')
4238                 }
4239         return info
4240
4241 class MySpassIE(InfoExtractor):
4242     _VALID_URL = r'http://www.myspass.de/.*'
4243
4244     def _real_extract(self, url):
4245         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4246
4247         # video id is the last path element of the URL
4248         # usually there is a trailing slash, so also try the second but last
4249         url_path = compat_urllib_parse_urlparse(url).path
4250         url_parent_path, video_id = os.path.split(url_path)
4251         if not video_id:
4252             _, video_id = os.path.split(url_parent_path)
4253
4254         # get metadata
4255         metadata_url = META_DATA_URL_TEMPLATE % video_id
4256         metadata_text = self._download_webpage(metadata_url, video_id)
4257         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4258
4259         # extract values from metadata
4260         url_flv_el = metadata.find('url_flv')
4261         if url_flv_el is None:
4262             self._downloader.report_error(u'unable to extract download url')
4263             return
4264         video_url = url_flv_el.text
4265         extension = os.path.splitext(video_url)[1][1:]
4266         title_el = metadata.find('title')
4267         if title_el is None:
4268             self._downloader.report_error(u'unable to extract title')
4269             return
4270         title = title_el.text
4271         format_id_el = metadata.find('format_id')
4272         if format_id_el is None:
4273             format = ext
4274         else:
4275             format = format_id_el.text
4276         description_el = metadata.find('description')
4277         if description_el is not None:
4278             description = description_el.text
4279         else:
4280             description = None
4281         imagePreview_el = metadata.find('imagePreview')
4282         if imagePreview_el is not None:
4283             thumbnail = imagePreview_el.text
4284         else:
4285             thumbnail = None
4286         info = {
4287             'id': video_id,
4288             'url': video_url,
4289             'title': title,
4290             'ext': extension,
4291             'format': format,
4292             'thumbnail': thumbnail,
4293             'description': description
4294         }
4295         return [info]
4296
4297 class SpiegelIE(InfoExtractor):
4298     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4299
4300     def _real_extract(self, url):
4301         m = re.match(self._VALID_URL, url)
4302         video_id = m.group('videoID')
4303
4304         webpage = self._download_webpage(url, video_id)
4305         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4306         if not m:
4307             raise ExtractorError(u'Cannot find title')
4308         video_title = unescapeHTML(m.group(1))
4309
4310         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4311         xml_code = self._download_webpage(xml_url, video_id,
4312                     note=u'Downloading XML', errnote=u'Failed to download XML')
4313
4314         idoc = xml.etree.ElementTree.fromstring(xml_code)
4315         last_type = idoc[-1]
4316         filename = last_type.findall('./filename')[0].text
4317         duration = float(last_type.findall('./duration')[0].text)
4318
4319         video_url = 'http://video2.spiegel.de/flash/' + filename
4320         video_ext = filename.rpartition('.')[2]
4321         info = {
4322             'id': video_id,
4323             'url': video_url,
4324             'ext': video_ext,
4325             'title': video_title,
4326             'duration': duration,
4327         }
4328         return [info]
4329
4330 class LiveLeakIE(InfoExtractor):
4331
4332     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4333     IE_NAME = u'liveleak'
4334
4335     def _real_extract(self, url):
4336         mobj = re.match(self._VALID_URL, url)
4337         if mobj is None:
4338             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4339             return
4340
4341         video_id = mobj.group('video_id')
4342
4343         webpage = self._download_webpage(url, video_id)
4344
4345         m = re.search(r'file: "(.*?)",', webpage)
4346         if not m:
4347             self._downloader.report_error(u'unable to find video url')
4348             return
4349         video_url = m.group(1)
4350
4351         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4352         if not m:
4353             self._downloader.trouble(u'Cannot find video title')
4354         title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4355
4356         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4357         if m:
4358             desc = unescapeHTML(m.group('desc'))
4359         else:
4360             desc = None
4361
4362         m = re.search(r'By:.*?(\w+)</a>', webpage)
4363         if m:
4364             uploader = clean_html(m.group(1))
4365         else:
4366             uploader = None
4367
4368         info = {
4369             'id':  video_id,
4370             'url': video_url,
4371             'ext': 'mp4',
4372             'title': title,
4373             'description': desc,
4374             'uploader': uploader
4375         }
4376
4377         return [info]
4378
4379 class ARDIE(InfoExtractor):
4380     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4381     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4382     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4383
4384     def _real_extract(self, url):
4385         # determine video id from url
4386         m = re.match(self._VALID_URL, url)
4387
4388         numid = re.search(r'documentId=([0-9]+)', url)
4389         if numid:
4390             video_id = numid.group(1)
4391         else:
4392             video_id = m.group('video_id')
4393
4394         # determine title and media streams from webpage
4395         html = self._download_webpage(url, video_id)
4396         title = re.search(self._TITLE, html).group('title')
4397         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4398         if not streams:
4399             assert '"fsk"' in html
4400             self._downloader.report_error(u'this video is only available after 8:00 pm')
4401             return
4402
4403         # choose default media type and highest quality for now
4404         stream = max([s for s in streams if int(s["media_type"]) == 0],
4405                      key=lambda s: int(s["quality"]))
4406
4407         # there's two possibilities: RTMP stream or HTTP download
4408         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4409         if stream['rtmp_url']:
4410             self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
4411             assert stream['video_url'].startswith('mp4:')
4412             info["url"] = stream["rtmp_url"]
4413             info["play_path"] = stream['video_url']
4414         else:
4415             assert stream["video_url"].endswith('.mp4')
4416             info["url"] = stream["video_url"]
4417         return [info]
4418
4419
4420 def gen_extractors():
4421     """ Return a list of an instance of every supported extractor.
4422     The order does matter; the first extractor matched is the one handling the URL.
4423     """
4424     return [
4425         YoutubePlaylistIE(),
4426         YoutubeChannelIE(),
4427         YoutubeUserIE(),
4428         YoutubeSearchIE(),
4429         YoutubeIE(),
4430         MetacafeIE(),
4431         DailymotionIE(),
4432         GoogleSearchIE(),
4433         PhotobucketIE(),
4434         YahooIE(),
4435         YahooSearchIE(),
4436         DepositFilesIE(),
4437         FacebookIE(),
4438         BlipTVUserIE(),
4439         BlipTVIE(),
4440         VimeoIE(),
4441         MyVideoIE(),
4442         ComedyCentralIE(),
4443         EscapistIE(),
4444         CollegeHumorIE(),
4445         XVideosIE(),
4446         SoundcloudSetIE(),
4447         SoundcloudIE(),
4448         InfoQIE(),
4449         MixcloudIE(),
4450         StanfordOpenClassroomIE(),
4451         MTVIE(),
4452         YoukuIE(),
4453         XNXXIE(),
4454         YouJizzIE(),
4455         PornotubeIE(),
4456         YouPornIE(),
4457         GooglePlusIE(),
4458         ArteTvIE(),
4459         NBAIE(),
4460         WorldStarHipHopIE(),
4461         JustinTVIE(),
4462         FunnyOrDieIE(),
4463         SteamIE(),
4464         UstreamIE(),
4465         RBMARadioIE(),
4466         EightTracksIE(),
4467         KeekIE(),
4468         TEDIE(),
4469         MySpassIE(),
4470         SpiegelIE(),
4471         LiveLeakIE(),
4472         ARDIE(),
4473         GenericIE()
4474     ]
4475
4476 def get_info_extractor(ie_name):
4477     """Returns the info extractor class with the given ie_name"""
4478     return globals()[ie_name+'IE']