Remove calls to _downloader.download in YouTube searches
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
import base64
import datetime
import email.utils
import itertools
import math
import netrc
import operator
import os
import random
import re
import socket
import sys
import time
import xml.etree.ElementTree

from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Class-level defaults; _ready is flipped by initialize() once the
    # one-time _real_initialize() call has completed.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Name of the IE: class name minus the trailing 'IE' suffix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the response handle for url_or_request.

        note=False suppresses the progress message entirely; any other
        note (or the default) is printed before the request. Raises
        ExtractorError, carrying the original traceback, on network
        failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Forward the original traceback so the failure site stays visible.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a (decoded) string."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header,
        # falling back to UTF-8 otherwise.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
169
170
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Map itag -> file extension.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Map itag -> display dimensions (as shown by --list-formats).
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs can also match _VALID_URL; defer those to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or an
        (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option, then English, then the first
        # one listed.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per
        available language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """One-time setup: set language, optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # The login form embeds anti-forgery tokens that must be echoed back.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id (second _VALID_URL group) for url, or None
        (after reporting an error) if the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one of them
        # yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                    break
                except ValueError:
                    # This layout didn't match; try the next known one.
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
668
669
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST the
        age-confirmation form so filtered videos become accessible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video information for a metacafe.com URL.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; such pages are wrappers, so
        # delegate them to the YouTube extractor.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # The webpage is already text: the old .decode('utf-8') calls on
        # these values would raise AttributeError on Python 3.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
788
789
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # This extractor is currently flagged as not working.
    _WORKING = False

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information for a Dailymotion URL.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The path may carry a "<id>_<slug>" suffix and a query string;
        # keep only the bare video id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information; the cookie
        # disables the family filter so restricted pages are still served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality key, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            # for/else: no quality key was found at all.
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # The URL is percent- and JSON-escaped; unquote and fix the slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for an official user instead
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                # A missing uploader is non-fatal: warn and carry on.
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date, when present, is rendered as DD-MM-YYYY; convert
        # to the YYYYMMDD form used by the info dictionaries.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
877
878
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv URL, title and uploader for a photobucket page.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # urlopen().read() returns bytes; decode once so the str
            # regexes below work on Python 3 as well as Python 2.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Regex groups are already text; the old .decode('utf-8') calls
        # would raise AttributeError on Python 3.
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
942
943
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # This extractor is currently flagged as not working.
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information for a video.yahoo.com URL.

        Non-/watch/ URLs are first rewritten to the canonical /watch/
        form and re-extracted (new_video=False marks the second pass).
        Returns a one-element list with the info dictionary, or None
        after reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                # Decode once so the str regexes below work on Python 3.
                webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is the '(people|profile)' alternation; the uploader
        # name itself is captured by group(2).
        video_uploader = mobj.group(2)

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1)

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1)
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2))
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1085
1086
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information for a vimeo.com URL.

        Returns a one-element list with the info dictionary, or None
        after reporting an error through the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the ' = {config:' marker is missing;
            # ValueError: the extracted blob is not valid JSON.
            # (A bare except here used to hide unrelated bugs as well.)
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1205
1206
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its contents as text, or None after
        reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode once so the str regexes used in grep_webpage work on
            # Python 3 as well as Python 2.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, match regex against it and collect groups.

        matchTuples is a list of (group_index, key, error_message)
        tuples; each matched group is stored in the result dict under
        key.  Returns the dict, or None after reporting an error.
        """
        page = self.fetch_webpage(url)
        if page is None:
            # The download failed and was already reported; bail out
            # instead of crashing in re.search(regex, None) below.
            return
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # report_error for consistency with the rest of the file;
                # the old _downloader.trouble() entry point is deprecated.
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp stream data for a live URL.

        NOTE(review): the computed video_url is currently discarded and
        nothing is returned, so live URLs yield no downloadable result;
        behavior intentionally kept as-is pending a real implementation.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref XML chain for a '+7' (catch-up) URL and
        return its info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # info values are already text; the old .decode('utf-8') call
            # would raise AttributeError on Python 3.
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between live streams and regular '+7' videos."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1341
1342
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects, then scans an arbitrary webpage for
    common flash-player configuration patterns (JW Player / SWFObject)
    to find a direct video URL.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: tell the caller to proceed with extraction.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: this path previously reported 'unable to extract title'.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1486
1487
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts ``ytsearch:<query>`` (first result), ``ytsearchN:<query>``
    (first N results) and ``ytsearchall:<query>`` (up to the API cap).
    Results are returned as url_result entries for the Youtube extractor,
    they are not downloaded directly.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'ytsearch' scheme prefix
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the results were computed but never returned here,
            # so 'ytsearchall:' queries silently produced nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the total result count caps the limit.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                # Use report_error like the rest of the file; trouble() is the
                # deprecated spelling.
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1561
1562
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts ``gvsearch:<query>``, ``gvsearchN:<query>`` and
    ``gvsearchall:<query>``; the matched video pages are handed straight
    to the downloader.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'gvsearch' scheme prefix
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_all(self, video_ids):
        """Hand every collected video page URL over to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUG FIX: decode the response so the str regexes below also
                # work on Python 3 (re rejects mixing str patterns and bytes).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_all(video_ids)
                        return

            # No "next page" link: we have exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_all(video_ids)
                return

            pagenum = pagenum + 1
1643
1644
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts ``yvsearch:<query>``, ``yvsearchN:<query>`` and
    ``yvsearchall:<query>``; matched watch pages are handed straight to
    the downloader. Currently marked as not working (_WORKING = False).
    """

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the 'yvsearch' scheme prefix
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_all(self, video_ids):
        """Hand every collected watch-page URL over to the downloader."""
        for id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # BUG FIX: decode the response so the str regexes below also
                # work on Python 3 (re rejects mixing str patterns and bytes).
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._download_all(video_ids)
                        return

            # No "Next" link: we have exhausted the results.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_all(video_ids)
                return

            pagenum = pagenum + 1
1729
1730
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed and returns a playlist_result
    of url_result entries, ordered by the videos' playlist position.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which compiles without flags) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUG FIX: initialize the title so an empty playlist (a feed with no
        # 'entry' on the first page) no longer raises NameError below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(api_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so the final list can be sorted into
            # playlist order; entries without 'content' carry no video URL.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1813
1814
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the channel's list view, then pages through the JSON-based
    channel_ajax endpoint, and returns all found videos as a playlist of
    url_result entries.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the watch-page video ids found in page, in first-seen order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # First page comes from the regular HTML list view.
        self.report_download_page(channel_id, pagenum)
        request = compat_urllib_request.Request(self._TEMPLATE_URL % (channel_id, pagenum))
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids.extend(self.extract_videos_from_page(page))

        # Subsequent pages come from the json-based channel_ajax query,
        # as long as the load-more widget keeps appearing.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                request = compat_urllib_request.Request(self._MORE_PAGES_URL % (pagenum, channel_id))
                try:
                    raw_page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                ajax_page = json.loads(raw_page)

                video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

                if self._MORE_PAGES_INDICATOR not in ajax_page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1887
1888
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads feed via the GData API and returns
    all videos as a playlist of url_result entries.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API serves at most _GDATA_PAGE_SIZE ids per request,
        # so keep querying page by page until a short page signals the end.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, dropping duplicates within the page.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page shorter than the full page size must be the last one:
            # no need to issue another query.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1959
1960
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the user's numeric id from their page, then pages through the
    mobile episode-list AJAX endpoint and returns all episodes as a
    playlist of url_result entries.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: a page without a data-users-id attribute previously raised
        # an uncaught AttributeError (the except above only catches network
        # errors); report it like every other extraction failure instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Use compat_str(err) for consistency with the other handlers
                # in this file (was str(err)).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2039
2040
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title for a depositfiles link.

        Returns a one-element list holding the usual info dictionary, or
        None after reporting an error via the downloader.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed:
        # POSTing gateway_result=1 simulates having clicked the button.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        # NOTE(review): `webpage` is a bytes object (read() with no decode);
        # the str regexes here and the .decode() calls below look Python-2
        # specific — confirm behavior under Python 3 before touching this.
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice and
                # surface it to the user verbatim.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # File extension taken from the URL path, without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2099
2100
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from the --username/--password options or from
        .netrc. Without credentials this is a no-op (public videos work
        anonymously).
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in. POST data must be bytes on Python 3, hence the encode().
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # Decode the response: urlopen().read() returns bytes on
            # Python 3 and the pattern below is a text pattern.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail.

        The page embeds a JSON blob of SWF parameters; 'video_data'
        inside it carries the HD/SD stream URLs.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2198
2199
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video info via blip.tv's JSON API (iTunes skin).

        /play/ URLs are first resolved to the canonical /a/a-<id> form
        by following the redirect and reading the URL fragment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The file id is the last path component of the 'file' param
            # in the redirect URL's fragment.
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves JSON only to clients it recognizes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # basename is already text; the old .decode('UTF-8') call
                # on it crashed on Python 3.
                title,ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2300
2301
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Build the flv URL from the thumbnail base URL on the watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: this used to call self._download.report_error (no such
            # attribute), raising AttributeError instead of reporting.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media server base URL is only exposed via the thumbnail link.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2350
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # NOTE(review): _available_formats is not referenced inside this class;
    # presumably used by format validation elsewhere — confirm before removal.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate code -> container extension (used by _print_formats).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate code -> "WxH" display string (used by _print_formats).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show's MRSS index feed."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate codes (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per part of the requested episode or clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut forms (":tds", ":colbert", ...) map to the show's
        # full-episodes page and then go through the normal matching.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "newest episode"; the site
            # redirects it to the concrete episode page (handled below).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-match against the redirected URL to learn the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the media id without a URL prefix;
            # so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # The MRSS index has one <item> per part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the (bitrate, rtmp_url) pairs offered for this part.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into its HTTP equivalent on the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2545
2546
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL from the flash player's JS configuration."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        show_name = match.group('showname')
        video_id = match.group('episode')

        self.report_extraction(show_name)
        try:
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            # Honor the charset from the Content-Type header, if present.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Pull metadata out of the OpenGraph/meta tags.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        img_url = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        player_url = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        # The player URL embeds the (percent-encoded) configuration URL.
        config_url = compat_urllib_parse.unquote(re.search('config=(.*)$', player_url).group(1))

        self.report_config_download(show_name)
        try:
            config_handle = compat_urllib_request.urlopen(config_url)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # Playlist entry 1 holds the actual media URL.
        video_url = config['playlist'][1]['url']

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': show_name,
            'upload_date': None,
            'title': show_name,
            'ext': 'mp4',
            'thumbnail': img_url,
            'description': description,
            'player_url': player_url,
        }]
2620
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the Adobe F4M manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the fragment URL via the metadata XML and F4M manifest."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # Stage 1: metadata XML gives title/description/thumbnail and
        # the manifest location.
        self.report_extraction(video_id)
        meta_url = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            meta_xml = compat_urllib_request.urlopen(meta_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(meta_xml)
        try:
            video_node = mdoc.findall('./video')[0]
            info['description'] = video_node.findall('./description')[0].text
            info['title'] = video_node.findall('./caption')[0].text
            info['thumbnail'] = video_node.findall('./thumbnail')[0].text
            manifest_url = video_node.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Stage 2: the F4M manifest yields the media node id and the
        # manifest-level video id used to compose the fragment URL.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifest_xml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifest_xml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid manifest file')
            return

        parsed = compat_urllib_parse_urlparse(manifest_url)
        info['url'] = parsed.scheme + '://' + parsed.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        info['ext'] = 'f4f'
        return [info]
2691
2692
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report the start of information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Video URL (percent-encoded inside the player parameters)
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title (strip the site-name suffix from <title>)
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail image
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2750
2751
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the permalink is being resolved to a track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that the stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The uploader name and the slug of the song title are both in the URL.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into a track description via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask for the list of streams available for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2824
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report that the set permalink is being resolved."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report that stream definitions are being retrieved."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a soundcloud set URL and return one info dict per track.

        Uses report_error() for failures, consistent with SoundcloudIE;
        the trouble() calls this class previously used are the deprecated
        spelling that required a manual u'ERROR: ' prefix.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set permalink into its JSON description via the API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        # The API reports problems through an 'errors' list in the payload.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # One stream lookup per track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2905
2906
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real stream id sits base64- and url-encoded in the page source.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title comes from an inline JavaScript assignment.
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # The description meta tag is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # Derive id and extension from the last path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2960
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict keyed by bitrate, or (when there is
        no bitrate information) directly a list of urls.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate combinations (--list-formats)."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.Match.group() already returns text; the old
        # .decode('utf-8') calls crash on Python 3 (str has no decode()).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON: decode the raw bytes explicitly so json.loads always
        # receives text (bytes input is not accepted on older Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
3075
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The URL encodes three cases: a specific video ('course' and 'video'
    # groups both set), a course page ('course' only), or the site root.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, one course page, or
        the root page. The page/course branches build a list of 'reference'
        entries and resolve each one recursively via self.extract().
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            # Each video has a sidecar XML file with its title and file name.
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # findall()[0] raises IndexError if either tag is missing.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            # Extension is whatever follows the last '.' of the file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title from the page heading; fall back to the id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every VideoPage link (deduplicated, order preserved)
            # and let self.extract() resolve each one to a real video.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Same reference/recursion scheme as above, one level higher:
            # every CoursePage link becomes a playlist entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3187
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns already-decoded text; the former
        # .decode('iso-8859-1') calls were redundant on Python 2 and crash
        # on Python 3 (str has no decode()), so they have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'unable to mtvn_uri' (missing verb)
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # Ask the mediaGen service for the renditions of this video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            # report_error replaces the deprecated trouble() call used before
            self._downloader.report_error(u'Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3267
3268
class YoukuIE(InfoExtractor):
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random numbers."""
        now_ms = int(time.time() * 1000)
        rnd_a = random.randint(1000, 1998)
        rnd_b = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, rnd_a, rnd_b)

    def _get_file_ID_mix_string(self, seed):
        """Return the character alphabet shuffled deterministically by seed.

        Uses a simple LCG-style generator to pick (and remove) one source
        character per step, so the same seed always yields the same order.
        """
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        mixed = []
        while source:
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id via the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto youku's format names.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Columns 8,9 of the decoded fileid carry the segment number, so
        # substitute them per segment while building each download URL.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3378
3379
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page body.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The flv URL is percent-encoded in the player parameters.
        m = re.search(self.VIDEO_URL_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(m.group(1))

        m = re.search(self.VIDEO_TITLE_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        m = re.search(self.VIDEO_THUMB_RE, webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = m.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3442
3443
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com posts containing a video."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading the post webpage."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post's video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (may legitimately stay None if not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            # Bug fix: previously fell through and crashed on mobj.group(1).
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes on the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            # Bug fix: previously fell through and crashed indexing an empty list.
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3567
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived from the page path, not scraped from the page.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: key was misspelled 'uploader_date'; the recognized
            # optional field (see module docstring) is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3603
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, info_dicts).

        On failure the error is reported and (0, []) is returned so the
        caller's tuple unpacking and list.extend() keep working.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Bug fix: returning None here made the caller crash with
            # "TypeError: 'NoneType' object is not iterable" on unpacking.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip the dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: archives are paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3690
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bug fix: report_error did not abort, so m.group() below crashed
            # with AttributeError on None. Raise instead.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the <title> tag before giving up.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Bug fix: self._downloader.trouble is deprecated and did not
                # abort, leading to the same AttributeError crash below.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3729
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so the flag must be passed here too.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return info dicts for every trailer listed on the game's video page."""
        page_match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = page_match.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Three parallel scans of the page: movie entries, titles, thumbnails.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        videos = []
        matched = zip(re.finditer(urlRE, webpage),
                      re.finditer(namesRE, webpage),
                      re.finditer(thumbsRE, webpage))
        for movie_match, name_match, thumb_match in matched:
            clip_id = movie_match.group('videoID')
            clip_url = movie_match.group('videoURL')
            if not clip_url:
                self._downloader.report_error(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name_match.group('videoName')),
                'thumbnail': thumb_match.group('thumbnail'),
            })
        return videos
3770
class UstreamIE(InfoExtractor):
    """Information extractor for recorded Ustream videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dict for a /recorded/<id> URL."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            # The CDN location is derived directly from the video id.
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3792
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct CDN URL embedded somewhere in the page source.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            # Bug fix: trouble() is deprecated; report_error() already adds
            # the 'ERROR:' prefix, so it is dropped from the message.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Bug fix: default title had a typo ('World Start Hip Hop').
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3848
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return the info dict for a single show, parsed from embedded JSON."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the 256 kbps constant-bitrate selector to the stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3883
3884
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; a cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (non-fatal)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (non-fatal)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: this previously tested the stale 'result' variable (the
            # last re.search above), so an unavailable --format was never caught.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
4001
4002
4003
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The title is taken from the URL itself, not the page.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: the message incorrectly said 'video title' here.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
4045
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve the watch page to its embed page and extract the flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # Title comes from the watch page's <title> tag.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual player lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable("file", ...).
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
4091
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play/next API until the last track and return all tracks."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id is enough to drive the anonymous player API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
4135
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return the info dict for a keek; media URLs derive from the video id."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
4159
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Named groups type_playlist/type_talk decide which extraction path runs.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches one <li> per talk; the mediaSlug group is unused here but
        # kept for parity with _talk_info's extraction.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each entry is delegated back to this IE via a url_result, so the
        # talks themselves are extracted by _talk_info later.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        # video_id is only a placeholder for logging until the real id is
        # scraped from the talkDetails script block below.
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4237
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read the undefined name `ext`,
            # raising NameError whenever <format_id> was missing; the
            # extension computed above is the intended fallback.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4293
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (flash XML manifest)."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The flash manifest lists the available encodings; the last entry
        # is the one the original implementation picks.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4326
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Use report_error for consistency with the other extractors
            # (the old _downloader.trouble API is deprecated).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: the failure was previously only logged and execution
            # fell through to m.group('title'), crashing with AttributeError.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4375
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id: prefer an explicit documentId query parameter,
        # fall back to the last URL path component
        mobj = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else mobj.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = []
        for stream_match in re.finditer(self._MEDIA_STREAM, html):
            streams.append(stream_match.groupdict())
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        candidates = [s for s in streams if int(s['media_type']) == 0]
        stream = max(candidates, key=lambda s: int(s['quality']))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info['url'] = stream['rtmp_url']
            info['play_path'] = stream['video_url']
        else:
            assert stream['video_url'].endswith('.mp4')
            info['url'] = stream['video_url']
        return [info]
4415
4416
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4472
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()['%sIE' % ie_name]