git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r"""^
 101                          (
 102                              (?:https?://)?                                       # http(s):// (optional)
 103                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 104                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 105                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 106                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 107                              (?:                                                  # the various things that can precede the ID:
 108                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 109                                  |(?:                                             # or the v= param in all its forms
 110                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 111                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 112                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 113                                      v=
 114                                  )
 115                              )?                                                   # optional -> youtube.com/xxxx is OK
 116                          )?                                                       # all until now is optional -> you can pass the naked ID
 117                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 118                          (?(1).+)?                                                # if we found the ID, everything can follow
 119                          $"""
 120         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 121         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 122         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 123         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 124         _NETRC_MACHINE = 'youtube'
 125         # Listed in order of quality
 126         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 127         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 128         _video_extensions = {
 129                 '13': '3gp',
 130                 '17': 'mp4',
 131                 '18': 'mp4',
 132                 '22': 'mp4',
 133                 '37': 'mp4',
 134                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 135                 '43': 'webm',
 136                 '44': 'webm',
 137                 '45': 'webm',
 138                 '46': 'webm',
 139         }
 140         _video_dimensions = {
 141                 '5': '240x400',
 142                 '6': '???',
 143                 '13': '???',
 144                 '17': '144x176',
 145                 '18': '360x640',
 146                 '22': '720x1280',
 147                 '34': '360x640',
 148                 '35': '480x854',
 149                 '37': '1080x1920',
 150                 '38': '3072x4096',
 151                 '43': '360x640',
 152                 '44': '480x854',
 153                 '45': '720x1280',
 154                 '46': '1080x1920',
 155         }
 156         IE_NAME = u'youtube'
 157
 158         def suitable(self, url):
 159                 """Receives a URL and returns True if suitable for this IE."""
 160                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 161
 162         def report_lang(self):
 163                 """Report attempt to set language."""
 164                 self._downloader.to_screen(u'[youtube] Setting language')
 165
 166         def report_login(self):
 167                 """Report attempt to log in."""
 168                 self._downloader.to_screen(u'[youtube] Logging in')
 169
 170         def report_age_confirmation(self):
 171                 """Report attempt to confirm age."""
 172                 self._downloader.to_screen(u'[youtube] Confirming age')
 173
 174         def report_video_webpage_download(self, video_id):
 175                 """Report attempt to download video webpage."""
 176                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 177
 178         def report_video_info_webpage_download(self, video_id):
 179                 """Report attempt to download video info webpage."""
 180                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 181
 182         def report_video_subtitles_download(self, video_id):
 183                 """Report attempt to download video info webpage."""
 184                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 185
 186         def report_information_extraction(self, video_id):
 187                 """Report attempt to extract video information."""
 188                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 189
 190         def report_unavailable_format(self, video_id, format):
 191                 """Report extracted video URL."""
 192                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 193
 194         def report_rtmp_download(self):
 195                 """Indicate the download will use the RTMP protocol."""
 196                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 197
 198         def _closed_captions_xml_to_srt(self, xml_string):
 199                 srt = ''
 200                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 201                 # TODO parse xml instead of regex
 202                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 203                         if not dur: dur = '4'
 204                         start = float(start)
 205                         end = start + float(dur)
 206                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 207                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 208                         caption = unescapeHTML(caption)
 209                         caption = unescapeHTML(caption) # double cycle, intentional
 210                         srt += str(n+1) + '\n'
 211                         srt += start + ' --> ' + end + '\n'
 212                         srt += caption + '\n\n'
 213                 return srt
 214
 215         def _print_formats(self, formats):
 216                 print 'Available formats:'
 217                 for x in formats:
 218                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 219
 220         def _real_initialize(self):
 221                 if self._downloader is None:
 222                         return
 223
 224                 username = None
 225                 password = None
 226                 downloader_params = self._downloader.params
 227
 228                 # Attempt to use provided username and password or .netrc data
 229                 if downloader_params.get('username', None) is not None:
 230                         username = downloader_params['username']
 231                         password = downloader_params['password']
 232                 elif downloader_params.get('usenetrc', False):
 233                         try:
 234                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 235                                 if info is not None:
 236                                         username = info[0]
 237                                         password = info[2]
 238                                 else:
 239                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 240                         except (IOError, netrc.NetrcParseError), err:
 241                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 242                                 return
 243
 244                 # Set language
 245                 request = urllib2.Request(self._LANG_URL)
 246                 try:
 247                         self.report_lang()
 248                         urllib2.urlopen(request).read()
 249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 250                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 251                         return
 252
 253                 # No authentication to be performed
 254                 if username is None:
 255                         return
 256
 257                 # Log in
 258                 login_form = {
 259                                 'current_form': 'loginForm',
 260                                 'next':         '/',
 261                                 'action_login': 'Log In',
 262                                 'username':     username,
 263                                 'password':     password,
 264                                 }
 265                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 266                 try:
 267                         self.report_login()
 268                         login_results = urllib2.urlopen(request).read()
 269                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 270                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 271                                 return
 272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 273                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 274                         return
 275
 276                 # Confirm age
 277                 age_form = {
 278                                 'next_url':             '/',
 279                                 'action_confirm':       'Confirm',
 280                                 }
 281                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 282                 try:
 283                         self.report_age_confirmation()
 284                         age_results = urllib2.urlopen(request).read()
 285                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 286                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 287                         return
 288
 289         def _real_extract(self, url):
 290                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 291                 mobj = re.search(self._NEXT_URL_RE, url)
 292                 if mobj:
 293                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 294
 295                 # Extract video id from URL
 296                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 297                 if mobj is None:
 298                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 299                         return
 300                 video_id = mobj.group(2)
 301
 302                 # Get video webpage
 303                 self.report_video_webpage_download(video_id)
 304                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 305                 try:
 306                         video_webpage = urllib2.urlopen(request).read()
 307                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 308                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 309                         return
 310
 311                 # Attempt to extract SWF player URL
 312                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 313                 if mobj is not None:
 314                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 315                 else:
 316                         player_url = None
 317
 318                 # Get video info
 319                 self.report_video_info_webpage_download(video_id)
 320                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 321                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 322                                         % (video_id, el_type))
 323                         request = urllib2.Request(video_info_url)
 324                         try:
 325                                 video_info_webpage = urllib2.urlopen(request).read()
 326                                 video_info = parse_qs(video_info_webpage)
 327                                 if 'token' in video_info:
 328                                         break
 329                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 330                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 331                                 return
 332                 if 'token' not in video_info:
 333                         if 'reason' in video_info:
 334                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 335                         else:
 336                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 337                         return
 338
 339                 # Check for "rental" videos
 340                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 341                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 342                         return
 343
 344                 # Start extracting information
 345                 self.report_information_extraction(video_id)
 346
 347                 # uploader
 348                 if 'author' not in video_info:
 349                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 350                         return
 351                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 352
 353                 # title
 354                 if 'title' not in video_info:
 355                         self._downloader.trouble(u'ERROR: unable to extract video title')
 356                         return
 357                 video_title = urllib.unquote_plus(video_info['title'][0])
 358                 video_title = video_title.decode('utf-8')
 359
 360                 # thumbnail image
 361                 if 'thumbnail_url' not in video_info:
 362                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 363                         video_thumbnail = ''
 364                 else:   # don't panic if we can't find it
 365                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 366
 367                 # upload date
 368                 upload_date = u'NA'
 369                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 370                 if mobj is not None:
 371                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 372                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 373                         for expression in format_expressions:
 374                                 try:
 375                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 376                                 except:
 377                                         pass
 378
 379                 # description
 380                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 381                 if video_description: video_description = clean_html(video_description)
 382                 else: video_description = ''
 383
 384                 # closed captions
 385                 video_subtitles = None
 386                 if self._downloader.params.get('writesubtitles', False):
 387                         try:
 388                                 self.report_video_subtitles_download(video_id)
 389                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 390                                 try:
 391                                         srt_list = urllib2.urlopen(request).read()
 392                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 393                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 394                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 395                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 396                                 if not srt_lang_list:
 397                                         raise Trouble(u'WARNING: video has no closed captions')
 398                                 if self._downloader.params.get('subtitleslang', False):
 399                                         srt_lang = self._downloader.params.get('subtitleslang')
 400                                 elif 'en' in srt_lang_list:
 401                                         srt_lang = 'en'
 402                                 else:
 403                                         srt_lang = srt_lang_list.keys()[0]
 404                                 if not srt_lang in srt_lang_list:
 405                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 406                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 407                                 try:
 408                                         srt_xml = urllib2.urlopen(request).read()
 409                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 410                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 411                                 if not srt_xml:
 412                                         raise Trouble(u'WARNING: unable to download video subtitles')
 413                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 414                         except Trouble as trouble:
 415                                 self._downloader.trouble(trouble[0])
 416
 417                 if 'length_seconds' not in video_info:
 418                         self._downloader.trouble(u'WARNING: unable to extract video duration')
 419                         video_duration = ''
 420                 else:
 421                         video_duration = urllib.unquote_plus(video_info['length_seconds'][0])
 422
 423                 # token
 424                 video_token = urllib.unquote_plus(video_info['token'][0])
 425
 426                 # Decide which formats to download
 427                 req_format = self._downloader.params.get('format', None)
 428
 429                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 430                         self.report_rtmp_download()
 431                         video_url_list = [(None, video_info['conn'][0])]
 432                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 433                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 434                         url_data = [parse_qs(uds) for uds in url_data_strs]
 435                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 436                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 437
 438                         format_limit = self._downloader.params.get('format_limit', None)
 439                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 440                         if format_limit is not None and format_limit in available_formats:
 441                                 format_list = available_formats[available_formats.index(format_limit):]
 442                         else:
 443                                 format_list = available_formats
 444                         existing_formats = [x for x in format_list if x in url_map]
 445                         if len(existing_formats) == 0:
 446                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 447                                 return
 448                         if self._downloader.params.get('listformats', None):
 449                                 self._print_formats(existing_formats)
 450                                 return
 451                         if req_format is None or req_format == 'best':
 452                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 453                         elif req_format == 'worst':
 454                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 455                         elif req_format in ('-1', 'all'):
 456                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 457                         else:
 458                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 459                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 460                                 req_formats = req_format.split('/')
 461                                 video_url_list = None
 462                                 for rf in req_formats:
 463                                         if rf in url_map:
 464                                                 video_url_list = [(rf, url_map[rf])]
 465                                                 break
 466                                 if video_url_list is None:
 467                                         self._downloader.trouble(u'ERROR: requested format not available')
 468                                         return
 469                 else:
 470                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 471                         return
 472
 473                 results = []
 474                 for format_param, video_real_url in video_url_list:
 475                         # Extension
 476                         video_extension = self._video_extensions.get(format_param, 'flv')
 477
 478                         results.append({
 479                                 'id':           video_id.decode('utf-8'),
 480                                 'url':          video_real_url.decode('utf-8'),
 481                                 'uploader':     video_uploader.decode('utf-8'),
 482                                 'upload_date':  upload_date,
 483                                 'title':        video_title,
 484                                 'ext':          video_extension.decode('utf-8'),
 485                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 486                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 487                                 'description':  video_description,
 488                                 'player_url':   player_url,
 489                                 'subtitles':    video_subtitles,
 490                                 'duration':             video_duration
 491                         })
 492                 return results
 493
 494
 495 class MetacafeIE(InfoExtractor):
 496         """Information Extractor for metacafe.com."""
 497
 498         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 499         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 500         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 501         IE_NAME = u'metacafe'
 502
 503         def __init__(self, downloader=None):
 504                 InfoExtractor.__init__(self, downloader)
 505
 506         def report_disclaimer(self):
 507                 """Report disclaimer retrieval."""
 508                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 509
 510         def report_age_confirmation(self):
 511                 """Report attempt to confirm age."""
 512                 self._downloader.to_screen(u'[metacafe] Confirming age')
 513
 514         def report_download_webpage(self, video_id):
 515                 """Report webpage download."""
 516                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 517
 518         def report_extraction(self, video_id):
 519                 """Report information extraction."""
 520                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 521
 522         def _real_initialize(self):
 523                 # Retrieve disclaimer
 524                 request = urllib2.Request(self._DISCLAIMER)
 525                 try:
 526                         self.report_disclaimer()
 527                         disclaimer = urllib2.urlopen(request).read()
 528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 529                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 530                         return
 531
 532                 # Confirm age
 533                 disclaimer_form = {
 534                         'filters': '0',
 535                         'submit': "Continue - I'm over 18",
 536                         }
 537                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 538                 try:
 539                         self.report_age_confirmation()
 540                         disclaimer = urllib2.urlopen(request).read()
 541                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 542                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 543                         return
 544
 545         def _real_extract(self, url):
 546                 # Extract id and simplified title from URL
 547                 mobj = re.match(self._VALID_URL, url)
 548                 if mobj is None:
 549                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 550                         return
 551
 552                 video_id = mobj.group(1)
 553
 554                 # Check if video comes from YouTube
 555                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 556                 if mobj2 is not None:
 557                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 558                         return
 559
 560                 # Retrieve video webpage to extract further information
 561                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 562                 try:
 563                         self.report_download_webpage(video_id)
 564                         webpage = urllib2.urlopen(request).read()
 565                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 566                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 567                         return
 568
 569                 # Extract URL, uploader and title from webpage
 570                 self.report_extraction(video_id)
 571                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 572                 if mobj is not None:
 573                         mediaURL = urllib.unquote(mobj.group(1))
 574                         video_extension = mediaURL[-3:]
 575
 576                         # Extract gdaKey if available
 577                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 578                         if mobj is None:
 579                                 video_url = mediaURL
 580                         else:
 581                                 gdaKey = mobj.group(1)
 582                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 583                 else:
 584                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 585                         if mobj is None:
 586                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 587                                 return
 588                         vardict = parse_qs(mobj.group(1))
 589                         if 'mediaData' not in vardict:
 590                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 591                                 return
 592                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 593                         if mobj is None:
 594                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 595                                 return
 596                         mediaURL = mobj.group(1).replace('\\/', '/')
 597                         video_extension = mediaURL[-3:]
 598                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 599
 600                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 601                 if mobj is None:
 602                         self._downloader.trouble(u'ERROR: unable to extract title')
 603                         return
 604                 video_title = mobj.group(1).decode('utf-8')
 605
 606                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 607                 if mobj is None:
 608                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 609                         return
 610                 video_uploader = mobj.group(1)
 611
 612                 return [{
 613                         'id':           video_id.decode('utf-8'),
 614                         'url':          video_url.decode('utf-8'),
 615                         'uploader':     video_uploader.decode('utf-8'),
 616                         'upload_date':  u'NA',
 617                         'title':        video_title,
 618                         'ext':          video_extension.decode('utf-8'),
 619                         'format':       u'NA',
 620                         'player_url':   None,
 621                 }]
 622
 623
 624 class DailymotionIE(InfoExtractor):
 625         """Information Extractor for Dailymotion"""
 626
 627         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 628         IE_NAME = u'dailymotion'
 629
 630         def __init__(self, downloader=None):
 631                 InfoExtractor.__init__(self, downloader)
 632
 633         def report_download_webpage(self, video_id):
 634                 """Report webpage download."""
 635                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 636
 637         def report_extraction(self, video_id):
 638                 """Report information extraction."""
 639                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 640
 641         def _real_extract(self, url):
 642                 # Extract id and simplified title from URL
 643                 mobj = re.match(self._VALID_URL, url)
 644                 if mobj is None:
 645                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 646                         return
 647
 648                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 649
 650                 video_extension = 'mp4'
 651
 652                 # Retrieve video webpage to extract further information
 653                 request = urllib2.Request(url)
 654                 request.add_header('Cookie', 'family_filter=off')
 655                 try:
 656                         self.report_download_webpage(video_id)
 657                         webpage = urllib2.urlopen(request).read()
 658                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 659                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 660                         return
 661
 662                 # Extract URL, uploader and title from webpage
 663                 self.report_extraction(video_id)
 664                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 665                 if mobj is None:
 666                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 667                         return
 668                 flashvars = urllib.unquote(mobj.group(1))
 669
 670                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 671                         if key in flashvars:
 672                                 max_quality = key
 673                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 674                                 break
 675                 else:
 676                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 677                         return
 678
 679                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 680                 if mobj is None:
 681                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 682                         return
 683
 684                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
 685
 686                 # TODO: support choosing qualities
 687
 688                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 689                 if mobj is None:
 690                         self._downloader.trouble(u'ERROR: unable to extract title')
 691                         return
 692                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 693
 694                 video_uploader = u'NA'
 695                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 696                 if mobj is None:
 697                         self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 698                 else:
 699                         video_uploader = mobj.group(1)
 700
 701                 video_upload_date = u'NA'
 702                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 703                 if mobj is not None:
 704                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 705
 706                 return [{
 707                         'id':           video_id.decode('utf-8'),
 708                         'url':          video_url.decode('utf-8'),
 709                         'uploader':     video_uploader.decode('utf-8'),
 710                         'upload_date':  video_upload_date,
 711                         'title':        video_title,
 712                         'ext':          video_extension.decode('utf-8'),
 713                         'format':       u'NA',
 714                         'player_url':   None,
 715                 }]
 716
 717
 718 class GoogleIE(InfoExtractor):
 719         """Information extractor for video.google.com."""
 720
 721         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 722         IE_NAME = u'video.google'
 723
 724         def __init__(self, downloader=None):
 725                 InfoExtractor.__init__(self, downloader)
 726
 727         def report_download_webpage(self, video_id):
 728                 """Report webpage download."""
 729                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 730
 731         def report_extraction(self, video_id):
 732                 """Report information extraction."""
 733                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 734
 735         def _real_extract(self, url):
 736                 # Extract id from URL
 737                 mobj = re.match(self._VALID_URL, url)
 738                 if mobj is None:
 739                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 740                         return
 741
 742                 video_id = mobj.group(1)
 743
 744                 video_extension = 'mp4'
 745
 746                 # Retrieve video webpage to extract further information
 747                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 748                 try:
 749                         self.report_download_webpage(video_id)
 750                         webpage = urllib2.urlopen(request).read()
 751                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 752                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 753                         return
 754
 755                 # Extract URL, uploader, and title from webpage
 756                 self.report_extraction(video_id)
 757                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 758                 if mobj is None:
 759                         video_extension = 'flv'
 760                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 761                 if mobj is None:
 762                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 763                         return
 764                 mediaURL = urllib.unquote(mobj.group(1))
 765                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 766                 mediaURL = mediaURL.replace('\\x26', '\x26')
 767
 768                 video_url = mediaURL
 769
 770                 mobj = re.search(r'<title>(.*)</title>', webpage)
 771                 if mobj is None:
 772                         self._downloader.trouble(u'ERROR: unable to extract title')
 773                         return
 774                 video_title = mobj.group(1).decode('utf-8')
 775
 776                 # Extract video description
 777                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 778                 if mobj is None:
 779                         self._downloader.trouble(u'ERROR: unable to extract video description')
 780                         return
 781                 video_description = mobj.group(1).decode('utf-8')
 782                 if not video_description:
 783                         video_description = 'No description available.'
 784
 785                 # Extract video thumbnail
 786                 if self._downloader.params.get('forcethumbnail', False):
 787                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 788                         try:
 789                                 webpage = urllib2.urlopen(request).read()
 790                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 791                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 792                                 return
 793                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 794                         if mobj is None:
 795                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 796                                 return
 797                         video_thumbnail = mobj.group(1)
 798                 else:   # we need something to pass to process_info
 799                         video_thumbnail = ''
 800
 801                 return [{
 802                         'id':           video_id.decode('utf-8'),
 803                         'url':          video_url.decode('utf-8'),
 804                         'uploader':     u'NA',
 805                         'upload_date':  u'NA',
 806                         'title':        video_title,
 807                         'ext':          video_extension.decode('utf-8'),
 808                         'format':       u'NA',
 809                         'player_url':   None,
 810                 }]
 811
 812
 813 class PhotobucketIE(InfoExtractor):
 814         """Information extractor for photobucket.com."""
 815
 816         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 817         IE_NAME = u'photobucket'
 818
 819         def __init__(self, downloader=None):
 820                 InfoExtractor.__init__(self, downloader)
 821
 822         def report_download_webpage(self, video_id):
 823                 """Report webpage download."""
 824                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 825
 826         def report_extraction(self, video_id):
 827                 """Report information extraction."""
 828                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 829
 830         def _real_extract(self, url):
 831                 # Extract id from URL
 832                 mobj = re.match(self._VALID_URL, url)
 833                 if mobj is None:
 834                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 835                         return
 836
 837                 video_id = mobj.group(1)
 838
 839                 video_extension = 'flv'
 840
 841                 # Retrieve video webpage to extract further information
 842                 request = urllib2.Request(url)
 843                 try:
 844                         self.report_download_webpage(video_id)
 845                         webpage = urllib2.urlopen(request).read()
 846                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 847                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 848                         return
 849
 850                 # Extract URL, uploader, and title from webpage
 851                 self.report_extraction(video_id)
 852                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 853                 if mobj is None:
 854                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 855                         return
 856                 mediaURL = urllib.unquote(mobj.group(1))
 857
 858                 video_url = mediaURL
 859
 860                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 861                 if mobj is None:
 862                         self._downloader.trouble(u'ERROR: unable to extract title')
 863                         return
 864                 video_title = mobj.group(1).decode('utf-8')
 865
 866                 video_uploader = mobj.group(2).decode('utf-8')
 867
 868                 return [{
 869                         'id':           video_id.decode('utf-8'),
 870                         'url':          video_url.decode('utf-8'),
 871                         'uploader':     video_uploader,
 872                         'upload_date':  u'NA',
 873                         'title':        video_title,
 874                         'ext':          video_extension.decode('utf-8'),
 875                         'format':       u'NA',
 876                         'player_url':   None,
 877                 }]
 878
 879
 880 class YahooIE(InfoExtractor):
 881         """Information extractor for video.yahoo.com."""
 882
 883         # _VALID_URL matches all Yahoo! Video URLs
 884         # _VPAGE_URL matches only the extractable '/watch/' URLs
 885         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 886         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 887         IE_NAME = u'video.yahoo'
 888
 889         def __init__(self, downloader=None):
 890                 InfoExtractor.__init__(self, downloader)
 891
 892         def report_download_webpage(self, video_id):
 893                 """Report webpage download."""
 894                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 895
 896         def report_extraction(self, video_id):
 897                 """Report information extraction."""
 898                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 899
 900         def _real_extract(self, url, new_video=True):
 901                 # Extract ID from URL
 902                 mobj = re.match(self._VALID_URL, url)
 903                 if mobj is None:
 904                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 905                         return
 906
 907                 video_id = mobj.group(2)
 908                 video_extension = 'flv'
 909
 910                 # Rewrite valid but non-extractable URLs as
 911                 # extractable English language /watch/ URLs
 912                 if re.match(self._VPAGE_URL, url) is None:
 913                         request = urllib2.Request(url)
 914                         try:
 915                                 webpage = urllib2.urlopen(request).read()
 916                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 917                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 918                                 return
 919
 920                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 921                         if mobj is None:
 922                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 923                                 return
 924                         yahoo_id = mobj.group(1)
 925
 926                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 927                         if mobj is None:
 928                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 929                                 return
 930                         yahoo_vid = mobj.group(1)
 931
 932                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 933                         return self._real_extract(url, new_video=False)
 934
 935                 # Retrieve video webpage to extract further information
 936                 request = urllib2.Request(url)
 937                 try:
 938                         self.report_download_webpage(video_id)
 939                         webpage = urllib2.urlopen(request).read()
 940                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 941                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 942                         return
 943
 944                 # Extract uploader and title from webpage
 945                 self.report_extraction(video_id)
 946                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 947                 if mobj is None:
 948                         self._downloader.trouble(u'ERROR: unable to extract video title')
 949                         return
 950                 video_title = mobj.group(1).decode('utf-8')
 951
 952                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 953                 if mobj is None:
 954                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 955                         return
 956                 video_uploader = mobj.group(1).decode('utf-8')
 957
 958                 # Extract video thumbnail
 959                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 960                 if mobj is None:
 961                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 962                         return
 963                 video_thumbnail = mobj.group(1).decode('utf-8')
 964
 965                 # Extract video description
 966                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 967                 if mobj is None:
 968                         self._downloader.trouble(u'ERROR: unable to extract video description')
 969                         return
 970                 video_description = mobj.group(1).decode('utf-8')
 971                 if not video_description:
 972                         video_description = 'No description available.'
 973
 974                 # Extract video height and width
 975                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 976                 if mobj is None:
 977                         self._downloader.trouble(u'ERROR: unable to extract video height')
 978                         return
 979                 yv_video_height = mobj.group(1)
 980
 981                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 982                 if mobj is None:
 983                         self._downloader.trouble(u'ERROR: unable to extract video width')
 984                         return
 985                 yv_video_width = mobj.group(1)
 986
 987                 # Retrieve video playlist to extract media URL
 988                 # I'm not completely sure what all these options are, but we
 989                 # seem to need most of them, otherwise the server sends a 401.
 990                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 991                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 992                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 993                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 994                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 995                 try:
 996                         self.report_download_webpage(video_id)
 997                         webpage = urllib2.urlopen(request).read()
 998                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 999                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1000                         return
1001
1002                 # Extract media URL from playlist XML
1003                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1004                 if mobj is None:
1005                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1006                         return
1007                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1008                 video_url = unescapeHTML(video_url)
1009
1010                 return [{
1011                         'id':           video_id.decode('utf-8'),
1012                         'url':          video_url,
1013                         'uploader':     video_uploader,
1014                         'upload_date':  u'NA',
1015                         'title':        video_title,
1016                         'ext':          video_extension.decode('utf-8'),
1017                         'thumbnail':    video_thumbnail.decode('utf-8'),
1018                         'description':  video_description,
1019                         'thumbnail':    video_thumbnail,
1020                         'player_url':   None,
1021                 }]
1022
1023
1024 class VimeoIE(InfoExtractor):
1025         """Information extractor for vimeo.com."""
1026
1027         # _VALID_URL matches Vimeo URLs
1028         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1029         IE_NAME = u'vimeo'
1030
1031         def __init__(self, downloader=None):
1032                 InfoExtractor.__init__(self, downloader)
1033
1034         def report_download_webpage(self, video_id):
1035                 """Report webpage download."""
1036                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1037
1038         def report_extraction(self, video_id):
1039                 """Report information extraction."""
1040                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1041
1042         def _real_extract(self, url, new_video=True):
1043                 # Extract ID from URL
1044                 mobj = re.match(self._VALID_URL, url)
1045                 if mobj is None:
1046                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1047                         return
1048
1049                 video_id = mobj.group(1)
1050
1051                 # Retrieve video webpage to extract further information
1052                 request = urllib2.Request(url, None, std_headers)
1053                 try:
1054                         self.report_download_webpage(video_id)
1055                         webpage = urllib2.urlopen(request).read()
1056                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1057                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1058                         return
1059
1060                 # Now we begin extracting as much information as we can from what we
1061                 # retrieved. First we extract the information common to all extractors,
1062                 # and latter we extract those that are Vimeo specific.
1063                 self.report_extraction(video_id)
1064
1065                 # Extract the config JSON
1066                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1067                 try:
1068                         config = json.loads(config)
1069                 except:
1070                         self._downloader.trouble(u'ERROR: unable to extract info section')
1071                         return
1072
1073                 # Extract title
1074                 video_title = config["video"]["title"]
1075
1076                 # Extract uploader
1077                 video_uploader = config["video"]["owner"]["name"]
1078
1079                 # Extract video thumbnail
1080                 video_thumbnail = config["video"]["thumbnail"]
1081
1082                 # Extract video description
1083                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1084                 if video_description: video_description = clean_html(video_description)
1085                 else: video_description = ''
1086
1087                 # Extract upload date
1088                 video_upload_date = u'NA'
1089                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1090                 if mobj is not None:
1091                         video_upload_date = mobj.group(1)
1092
1093                 # Vimeo specific: extract request signature and timestamp
1094                 sig = config['request']['signature']
1095                 timestamp = config['request']['timestamp']
1096
1097                 # Vimeo specific: extract video codec and quality information
1098                 # TODO bind to format param
1099                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1100                 for codec in codecs:
1101                         if codec[0] in config["video"]["files"]:
1102                                 video_codec = codec[0]
1103                                 video_extension = codec[1]
1104                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1105                                 else: quality = 'sd'
1106                                 break
1107                 else:
1108                         self._downloader.trouble(u'ERROR: no known codec found')
1109                         return
1110
1111                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1112                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1113
1114                 return [{
1115                         'id':           video_id,
1116                         'url':          video_url,
1117                         'uploader':     video_uploader,
1118                         'upload_date':  video_upload_date,
1119                         'title':        video_title,
1120                         'ext':          video_extension,
1121                         'thumbnail':    video_thumbnail,
1122                         'description':  video_description,
1123                         'player_url':   None,
1124                 }]
1125
1126
1127 class GenericIE(InfoExtractor):
1128         """Generic last-resort information extractor."""
1129
1130         _VALID_URL = r'.*'
1131         IE_NAME = u'generic'
1132
1133         def __init__(self, downloader=None):
1134                 InfoExtractor.__init__(self, downloader)
1135
1136         def report_download_webpage(self, video_id):
1137                 """Report webpage download."""
1138                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1139                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1140
1141         def report_extraction(self, video_id):
1142                 """Report information extraction."""
1143                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1144
1145         def report_following_redirect(self, new_url):
1146                 """Report information extraction."""
1147                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1148
1149         def _test_redirect(self, url):
1150                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1151                 class HeadRequest(urllib2.Request):
1152                         def get_method(self):
1153                                 return "HEAD"
1154
1155                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1156                         """
1157                         Subclass the HTTPRedirectHandler to make it use our
1158                         HeadRequest also on the redirected URL
1159                         """
1160                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1161                                 if code in (301, 302, 303, 307):
1162                                         newurl = newurl.replace(' ', '%20')
1163                                         newheaders = dict((k,v) for k,v in req.headers.items()
1164                                                                           if k.lower() not in ("content-length", "content-type"))
1165                                         return HeadRequest(newurl,
1166                                                                            headers=newheaders,
1167                                                                            origin_req_host=req.get_origin_req_host(),
1168                                                                            unverifiable=True)
1169                                 else:
1170                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1171
1172                 class HTTPMethodFallback(urllib2.BaseHandler):
1173                         """
1174                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1175                         """
1176                         def http_error_405(self, req, fp, code, msg, headers):
1177                                 fp.read()
1178                                 fp.close()
1179
1180                                 newheaders = dict((k,v) for k,v in req.headers.items()
1181                                                                   if k.lower() not in ("content-length", "content-type"))
1182                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1183                                                                                                  headers=newheaders,
1184                                                                                                  origin_req_host=req.get_origin_req_host(),
1185                                                                                                  unverifiable=True))
1186
1187                 # Build our opener
1188                 opener = urllib2.OpenerDirector()
1189                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1190                                                 HTTPMethodFallback, HEADRedirectHandler,
1191                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1192                         opener.add_handler(handler())
1193
1194                 response = opener.open(HeadRequest(url))
1195                 new_url = response.geturl()
1196
1197                 if url == new_url: return False
1198
1199                 self.report_following_redirect(new_url)
1200                 self._downloader.download([new_url])
1201                 return True
1202
1203         def _real_extract(self, url):
1204                 if self._test_redirect(url): return
1205
1206                 video_id = url.split('/')[-1]
1207                 request = urllib2.Request(url)
1208                 try:
1209                         self.report_download_webpage(video_id)
1210                         webpage = urllib2.urlopen(request).read()
1211                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1212                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1213                         return
1214                 except ValueError, err:
1215                         # since this is the last-resort InfoExtractor, if
1216                         # this error is thrown, it'll be thrown here
1217                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1218                         return
1219
1220                 self.report_extraction(video_id)
1221                 # Start with something easy: JW Player in SWFObject
1222                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1223                 if mobj is None:
1224                         # Broaden the search a little bit
1225                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1226                 if mobj is None:
1227                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1228                         return
1229
1230                 # It's possible that one of the regexes
1231                 # matched, but returned an empty group:
1232                 if mobj.group(1) is None:
1233                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1234                         return
1235
1236                 video_url = urllib.unquote(mobj.group(1))
1237                 video_id = os.path.basename(video_url)
1238
1239                 # here's a fun little line of code for you:
1240                 video_extension = os.path.splitext(video_id)[1][1:]
1241                 video_id = os.path.splitext(video_id)[0]
1242
1243                 # it's tempting to parse this further, but you would
1244                 # have to take into account all the variations like
1245                 #   Video Title - Site Name
1246                 #   Site Name | Video Title
1247                 #   Video Title - Tagline | Site Name
1248                 # and so on and so forth; it's just not practical
1249                 mobj = re.search(r'<title>(.*)</title>', webpage)
1250                 if mobj is None:
1251                         self._downloader.trouble(u'ERROR: unable to extract title')
1252                         return
1253                 video_title = mobj.group(1).decode('utf-8')
1254
1255                 # video uploader is domain name
1256                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1257                 if mobj is None:
1258                         self._downloader.trouble(u'ERROR: unable to extract title')
1259                         return
1260                 video_uploader = mobj.group(1).decode('utf-8')
1261
1262                 return [{
1263                         'id':           video_id.decode('utf-8'),
1264                         'url':          video_url.decode('utf-8'),
1265                         'uploader':     video_uploader,
1266                         'upload_date':  u'NA',
1267                         'title':        video_title,
1268                         'ext':          video_extension.decode('utf-8'),
1269                         'format':       u'NA',
1270                         'player_url':   None,
1271                 }]
1272
1273
1274 class YoutubeSearchIE(InfoExtractor):
1275         """Information Extractor for YouTube search queries."""
1276         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1277         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1278         _max_youtube_results = 1000
1279         IE_NAME = u'youtube:search'
1280
1281         def __init__(self, downloader=None):
1282                 InfoExtractor.__init__(self, downloader)
1283
1284         def report_download_page(self, query, pagenum):
1285                 """Report attempt to download search page with given number."""
1286                 query = query.decode(preferredencoding())
1287                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1288
1289         def _real_extract(self, query):
1290                 mobj = re.match(self._VALID_URL, query)
1291                 if mobj is None:
1292                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1293                         return
1294
1295                 prefix, query = query.split(':')
1296                 prefix = prefix[8:]
1297                 query = query.encode('utf-8')
1298                 if prefix == '':
1299                         self._download_n_results(query, 1)
1300                         return
1301                 elif prefix == 'all':
1302                         self._download_n_results(query, self._max_youtube_results)
1303                         return
1304                 else:
1305                         try:
1306                                 n = long(prefix)
1307                                 if n <= 0:
1308                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1309                                         return
1310                                 elif n > self._max_youtube_results:
1311                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1312                                         n = self._max_youtube_results
1313                                 self._download_n_results(query, n)
1314                                 return
1315                         except ValueError: # parsing prefix as integer fails
1316                                 self._download_n_results(query, 1)
1317                                 return
1318
1319         def _download_n_results(self, query, n):
1320                 """Downloads a specified number of results for a query"""
1321
1322                 video_ids = []
1323                 pagenum = 0
1324                 limit = n
1325
1326                 while (50 * pagenum) < limit:
1327                         self.report_download_page(query, pagenum+1)
1328                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1329                         request = urllib2.Request(result_url)
1330                         try:
1331                                 data = urllib2.urlopen(request).read()
1332                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1333                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1334                                 return
1335                         api_response = json.loads(data)['data']
1336
1337                         new_ids = list(video['id'] for video in api_response['items'])
1338                         video_ids += new_ids
1339
1340                         limit = min(n, api_response['totalItems'])
1341                         pagenum += 1
1342
1343                 if len(video_ids) > n:
1344                         video_ids = video_ids[:n]
1345                 for id in video_ids:
1346                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1347                 return
1348
1349
1350 class GoogleSearchIE(InfoExtractor):
1351         """Information Extractor for Google Video search queries."""
1352         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1353         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1354         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1355         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1356         _max_google_results = 1000
1357         IE_NAME = u'video.google:search'
1358
1359         def __init__(self, downloader=None):
1360                 InfoExtractor.__init__(self, downloader)
1361
1362         def report_download_page(self, query, pagenum):
1363                 """Report attempt to download playlist page with given number."""
1364                 query = query.decode(preferredencoding())
1365                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1366
1367         def _real_extract(self, query):
1368                 mobj = re.match(self._VALID_URL, query)
1369                 if mobj is None:
1370                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1371                         return
1372
1373                 prefix, query = query.split(':')
1374                 prefix = prefix[8:]
1375                 query = query.encode('utf-8')
1376                 if prefix == '':
1377                         self._download_n_results(query, 1)
1378                         return
1379                 elif prefix == 'all':
1380                         self._download_n_results(query, self._max_google_results)
1381                         return
1382                 else:
1383                         try:
1384                                 n = long(prefix)
1385                                 if n <= 0:
1386                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1387                                         return
1388                                 elif n > self._max_google_results:
1389                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1390                                         n = self._max_google_results
1391                                 self._download_n_results(query, n)
1392                                 return
1393                         except ValueError: # parsing prefix as integer fails
1394                                 self._download_n_results(query, 1)
1395                                 return
1396
1397         def _download_n_results(self, query, n):
1398                 """Downloads a specified number of results for a query"""
1399
1400                 video_ids = []
1401                 pagenum = 0
1402
1403                 while True:
1404                         self.report_download_page(query, pagenum)
1405                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1406                         request = urllib2.Request(result_url)
1407                         try:
1408                                 page = urllib2.urlopen(request).read()
1409                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1411                                 return
1412
1413                         # Extract video identifiers
1414                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1415                                 video_id = mobj.group(1)
1416                                 if video_id not in video_ids:
1417                                         video_ids.append(video_id)
1418                                         if len(video_ids) == n:
1419                                                 # Specified n videos reached
1420                                                 for id in video_ids:
1421                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1422                                                 return
1423
1424                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1425                                 for id in video_ids:
1426                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1427                                 return
1428
1429                         pagenum = pagenum + 1
1430
1431
1432 class YahooSearchIE(InfoExtractor):
1433         """Information Extractor for Yahoo! Video search queries."""
1434         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1435         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1436         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1437         _MORE_PAGES_INDICATOR = r'\s*Next'
1438         _max_yahoo_results = 1000
1439         IE_NAME = u'video.yahoo:search'
1440
1441         def __init__(self, downloader=None):
1442                 InfoExtractor.__init__(self, downloader)
1443
1444         def report_download_page(self, query, pagenum):
1445                 """Report attempt to download playlist page with given number."""
1446                 query = query.decode(preferredencoding())
1447                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1448
1449         def _real_extract(self, query):
1450                 mobj = re.match(self._VALID_URL, query)
1451                 if mobj is None:
1452                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1453                         return
1454
1455                 prefix, query = query.split(':')
1456                 prefix = prefix[8:]
1457                 query = query.encode('utf-8')
1458                 if prefix == '':
1459                         self._download_n_results(query, 1)
1460                         return
1461                 elif prefix == 'all':
1462                         self._download_n_results(query, self._max_yahoo_results)
1463                         return
1464                 else:
1465                         try:
1466                                 n = long(prefix)
1467                                 if n <= 0:
1468                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1469                                         return
1470                                 elif n > self._max_yahoo_results:
1471                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1472                                         n = self._max_yahoo_results
1473                                 self._download_n_results(query, n)
1474                                 return
1475                         except ValueError: # parsing prefix as integer fails
1476                                 self._download_n_results(query, 1)
1477                                 return
1478
1479         def _download_n_results(self, query, n):
1480                 """Downloads a specified number of results for a query"""
1481
1482                 video_ids = []
1483                 already_seen = set()
1484                 pagenum = 1
1485
1486                 while True:
1487                         self.report_download_page(query, pagenum)
1488                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1489                         request = urllib2.Request(result_url)
1490                         try:
1491                                 page = urllib2.urlopen(request).read()
1492                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1494                                 return
1495
1496                         # Extract video identifiers
1497                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1498                                 video_id = mobj.group(1)
1499                                 if video_id not in already_seen:
1500                                         video_ids.append(video_id)
1501                                         already_seen.add(video_id)
1502                                         if len(video_ids) == n:
1503                                                 # Specified n videos reached
1504                                                 for id in video_ids:
1505                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1506                                                 return
1507
1508                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1509                                 for id in video_ids:
1510                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1511                                 return
1512
1513                         pagenum = pagenum + 1
1514
1515
1516 class YoutubePlaylistIE(InfoExtractor):
1517         """Information Extractor for YouTube playlists."""
1518
1519         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1520         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1521         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1522         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1523         IE_NAME = u'youtube:playlist'
1524
1525         def __init__(self, downloader=None):
1526                 InfoExtractor.__init__(self, downloader)
1527
1528         def report_download_page(self, playlist_id, pagenum):
1529                 """Report attempt to download playlist page with given number."""
1530                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1531
1532         def _real_extract(self, url):
1533                 # Extract playlist id
1534                 mobj = re.match(self._VALID_URL, url)
1535                 if mobj is None:
1536                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1537                         return
1538
1539                 # Single video case
1540                 if mobj.group(3) is not None:
1541                         self._downloader.download([mobj.group(3)])
1542                         return
1543
1544                 # Download playlist pages
1545                 # prefix is 'p' as default for playlists but there are other types that need extra care
1546                 playlist_prefix = mobj.group(1)
1547                 if playlist_prefix == 'a':
1548                         playlist_access = 'artist'
1549                 else:
1550                         playlist_prefix = 'p'
1551                         playlist_access = 'view_play_list'
1552                 playlist_id = mobj.group(2)
1553                 video_ids = []
1554                 pagenum = 1
1555
1556                 while True:
1557                         self.report_download_page(playlist_id, pagenum)
1558                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1559                         request = urllib2.Request(url)
1560                         try:
1561                                 page = urllib2.urlopen(request).read()
1562                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1563                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1564                                 return
1565
1566                         # Extract video identifiers
1567                         ids_in_page = []
1568                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1569                                 if mobj.group(1) not in ids_in_page:
1570                                         ids_in_page.append(mobj.group(1))
1571                         video_ids.extend(ids_in_page)
1572
1573                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1574                                 break
1575                         pagenum = pagenum + 1
1576
1577                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1578                 playlistend = self._downloader.params.get('playlistend', -1)
1579                 if playlistend == -1:
1580                         video_ids = video_ids[playliststart:]
1581                 else:
1582                         video_ids = video_ids[playliststart:playlistend]
1583
1584                 for id in video_ids:
1585                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1586                 return
1587
1588
1589 class YoutubeChannelIE(InfoExtractor):
1590         """Information Extractor for YouTube channels."""
1591
1592         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1593         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1594         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1595         IE_NAME = u'youtube:channel'
1596
1597         def report_download_page(self, channel_id, pagenum):
1598                 """Report attempt to download channel page with given number."""
1599                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1600
1601         def _real_extract(self, url):
1602                 # Extract channel id
1603                 mobj = re.match(self._VALID_URL, url)
1604                 if mobj is None:
1605                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1606                         return
1607
1608                 # Download channel pages
1609                 channel_id = mobj.group(1)
1610                 video_ids = []
1611                 pagenum = 1
1612
1613                 while True:
1614                         self.report_download_page(channel_id, pagenum)
1615                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1616                         request = urllib2.Request(url)
1617                         try:
1618                                 page = urllib2.urlopen(request).read()
1619                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1620                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1621                                 return
1622
1623                         # Extract video identifiers
1624                         ids_in_page = []
1625                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1626                                 if mobj.group(1) not in ids_in_page:
1627                                         ids_in_page.append(mobj.group(1))
1628                         video_ids.extend(ids_in_page)
1629
1630                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1631                                 break
1632                         pagenum = pagenum + 1
1633
1634                 for id in video_ids:
1635                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1636                 return
1637
1638
1639 class YoutubeUserIE(InfoExtractor):
1640         """Information Extractor for YouTube users."""
1641
1642         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1643         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1644         _GDATA_PAGE_SIZE = 50
1645         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1646         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1647         IE_NAME = u'youtube:user'
1648
1649         def __init__(self, downloader=None):
1650                 InfoExtractor.__init__(self, downloader)
1651
1652         def report_download_page(self, username, start_index):
1653                 """Report attempt to download user page."""
1654                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1655                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1656
1657         def _real_extract(self, url):
1658                 # Extract username
1659                 mobj = re.match(self._VALID_URL, url)
1660                 if mobj is None:
1661                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1662                         return
1663
1664                 username = mobj.group(1)
1665
1666                 # Download video ids using YouTube Data API. Result size per
1667                 # query is limited (currently to 50 videos) so we need to query
1668                 # page by page until there are no video ids - it means we got
1669                 # all of them.
1670
1671                 video_ids = []
1672                 pagenum = 0
1673
1674                 while True:
1675                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1676                         self.report_download_page(username, start_index)
1677
1678                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1679
1680                         try:
1681                                 page = urllib2.urlopen(request).read()
1682                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1683                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1684                                 return
1685
1686                         # Extract video identifiers
1687                         ids_in_page = []
1688
1689                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1690                                 if mobj.group(1) not in ids_in_page:
1691                                         ids_in_page.append(mobj.group(1))
1692
1693                         video_ids.extend(ids_in_page)
1694
1695                         # A little optimization - if current page is not
1696                         # "full", ie. does not contain PAGE_SIZE video ids then
1697                         # we can assume that this page is the last one - there
1698                         # are no more ids on further pages - no need to query
1699                         # again.
1700
1701                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1702                                 break
1703
1704                         pagenum += 1
1705
1706                 all_ids_count = len(video_ids)
1707                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1708                 playlistend = self._downloader.params.get('playlistend', -1)
1709
1710                 if playlistend == -1:
1711                         video_ids = video_ids[playliststart:]
1712                 else:
1713                         video_ids = video_ids[playliststart:playlistend]
1714
1715                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1716                                 (username, all_ids_count, len(video_ids)))
1717
1718                 for video_id in video_ids:
1719                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1720
1721
1722 class BlipTVUserIE(InfoExtractor):
1723         """Information Extractor for blip.tv users."""
1724
1725         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1726         _PAGE_SIZE = 12
1727         IE_NAME = u'blip.tv:user'
1728
1729         def __init__(self, downloader=None):
1730                 InfoExtractor.__init__(self, downloader)
1731
1732         def report_download_page(self, username, pagenum):
1733                 """Report attempt to download user page."""
1734                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1735                                 (self.IE_NAME, username, pagenum))
1736
1737         def _real_extract(self, url):
1738                 # Extract username
1739                 mobj = re.match(self._VALID_URL, url)
1740                 if mobj is None:
1741                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1742                         return
1743
1744                 username = mobj.group(1)
1745
1746                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1747
1748                 request = urllib2.Request(url)
1749
1750                 try:
1751                         page = urllib2.urlopen(request).read().decode('utf-8')
1752                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1753                         page_base = page_base % mobj.group(1)
1754                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1755                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1756                         return
1757
1758
1759                 # Download video ids using BlipTV Ajax calls. Result size per
1760                 # query is limited (currently to 12 videos) so we need to query
1761                 # page by page until there are no video ids - it means we got
1762                 # all of them.
1763
1764                 video_ids = []
1765                 pagenum = 1
1766
1767                 while True:
1768                         self.report_download_page(username, pagenum)
1769
1770                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1771
1772                         try:
1773                                 page = urllib2.urlopen(request).read().decode('utf-8')
1774                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1776                                 return
1777
1778                         # Extract video identifiers
1779                         ids_in_page = []
1780
1781                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1782                                 if mobj.group(1) not in ids_in_page:
1783                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1784
1785                         video_ids.extend(ids_in_page)
1786
1787                         # A little optimization - if current page is not
1788                         # "full", ie. does not contain PAGE_SIZE video ids then
1789                         # we can assume that this page is the last one - there
1790                         # are no more ids on further pages - no need to query
1791                         # again.
1792
1793                         if len(ids_in_page) < self._PAGE_SIZE:
1794                                 break
1795
1796                         pagenum += 1
1797
1798                 all_ids_count = len(video_ids)
1799                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1800                 playlistend = self._downloader.params.get('playlistend', -1)
1801
1802                 if playlistend == -1:
1803                         video_ids = video_ids[playliststart:]
1804                 else:
1805                         video_ids = video_ids[playliststart:playlistend]
1806
1807                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1808                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1809
1810                 for video_id in video_ids:
1811                         self._downloader.download([u'http://blip.tv/'+video_id])
1812
1813
1814 class DepositFilesIE(InfoExtractor):
1815         """Information extractor for depositfiles.com"""
1816
1817         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1818         IE_NAME = u'DepositFiles'
1819
1820         def __init__(self, downloader=None):
1821                 InfoExtractor.__init__(self, downloader)
1822
1823         def report_download_webpage(self, file_id):
1824                 """Report webpage download."""
1825                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1826
1827         def report_extraction(self, file_id):
1828                 """Report information extraction."""
1829                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1830
1831         def _real_extract(self, url):
1832                 file_id = url.split('/')[-1]
1833                 # Rebuild url in english locale
1834                 url = 'http://depositfiles.com/en/files/' + file_id
1835
1836                 # Retrieve file webpage with 'Free download' button pressed
1837                 free_download_indication = { 'gateway_result' : '1' }
1838                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1839                 try:
1840                         self.report_download_webpage(file_id)
1841                         webpage = urllib2.urlopen(request).read()
1842                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1844                         return
1845
1846                 # Search for the real file URL
1847                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1848                 if (mobj is None) or (mobj.group(1) is None):
1849                         # Try to figure out reason of the error.
1850                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1851                         if (mobj is not None) and (mobj.group(1) is not None):
1852                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1853                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1854                         else:
1855                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1856                         return
1857
1858                 file_url = mobj.group(1)
1859                 file_extension = os.path.splitext(file_url)[1][1:]
1860
1861                 # Search for file title
1862                 mobj = re.search(r'<b title="(.*?)">', webpage)
1863                 if mobj is None:
1864                         self._downloader.trouble(u'ERROR: unable to extract title')
1865                         return
1866                 file_title = mobj.group(1).decode('utf-8')
1867
1868                 return [{
1869                         'id':           file_id.decode('utf-8'),
1870                         'url':          file_url.decode('utf-8'),
1871                         'uploader':     u'NA',
1872                         'upload_date':  u'NA',
1873                         'title':        file_title,
1874                         'ext':          file_extension.decode('utf-8'),
1875                         'format':       u'NA',
1876                         'player_url':   None,
1877                 }]
1878
1879
1880 class FacebookIE(InfoExtractor):
1881         """Information Extractor for Facebook"""
1882
1883         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1884         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1885         _NETRC_MACHINE = 'facebook'
1886         _available_formats = ['video', 'highqual', 'lowqual']
1887         _video_extensions = {
1888                 'video': 'mp4',
1889                 'highqual': 'mp4',
1890                 'lowqual': 'mp4',
1891         }
1892         IE_NAME = u'facebook'
1893
1894         def __init__(self, downloader=None):
1895                 InfoExtractor.__init__(self, downloader)
1896
1897         def _reporter(self, message):
1898                 """Add header and report message."""
1899                 self._downloader.to_screen(u'[facebook] %s' % message)
1900
1901         def report_login(self):
1902                 """Report attempt to log in."""
1903                 self._reporter(u'Logging in')
1904
1905         def report_video_webpage_download(self, video_id):
1906                 """Report attempt to download video webpage."""
1907                 self._reporter(u'%s: Downloading video webpage' % video_id)
1908
1909         def report_information_extraction(self, video_id):
1910                 """Report attempt to extract video information."""
1911                 self._reporter(u'%s: Extracting video information' % video_id)
1912
1913         def _parse_page(self, video_webpage):
1914                 """Extract video information from page"""
1915                 # General data
1916                 data = {'title': r'\("video_title", "(.*?)"\)',
1917                         'description': r'<div class="datawrap">(.*?)</div>',
1918                         'owner': r'\("video_owner_name", "(.*?)"\)',
1919                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1920                         }
1921                 video_info = {}
1922                 for piece in data.keys():
1923                         mobj = re.search(data[piece], video_webpage)
1924                         if mobj is not None:
1925                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1926
1927                 # Video urls
1928                 video_urls = {}
1929                 for fmt in self._available_formats:
1930                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1931                         if mobj is not None:
1932                                 # URL is in a Javascript segment inside an escaped Unicode format within
1933                                 # the generally utf-8 page
1934                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1935                 video_info['video_urls'] = video_urls
1936
1937                 return video_info
1938
1939         def _real_initialize(self):
1940                 if self._downloader is None:
1941                         return
1942
1943                 useremail = None
1944                 password = None
1945                 downloader_params = self._downloader.params
1946
1947                 # Attempt to use provided username and password or .netrc data
1948                 if downloader_params.get('username', None) is not None:
1949                         useremail = downloader_params['username']
1950                         password = downloader_params['password']
1951                 elif downloader_params.get('usenetrc', False):
1952                         try:
1953                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1954                                 if info is not None:
1955                                         useremail = info[0]
1956                                         password = info[2]
1957                                 else:
1958                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1959                         except (IOError, netrc.NetrcParseError), err:
1960                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1961                                 return
1962
1963                 if useremail is None:
1964                         return
1965
1966                 # Log in
1967                 login_form = {
1968                         'email': useremail,
1969                         'pass': password,
1970                         'login': 'Log+In'
1971                         }
1972                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1973                 try:
1974                         self.report_login()
1975                         login_results = urllib2.urlopen(request).read()
1976                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1977                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1978                                 return
1979                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1980                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1981                         return
1982
1983         def _real_extract(self, url):
1984                 mobj = re.match(self._VALID_URL, url)
1985                 if mobj is None:
1986                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1987                         return
1988                 video_id = mobj.group('ID')
1989
1990                 # Get video webpage
1991                 self.report_video_webpage_download(video_id)
1992                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1993                 try:
1994                         page = urllib2.urlopen(request)
1995                         video_webpage = page.read()
1996                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1997                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1998                         return
1999
2000                 # Start extracting information
2001                 self.report_information_extraction(video_id)
2002
2003                 # Extract information
2004                 video_info = self._parse_page(video_webpage)
2005
2006                 # uploader
2007                 if 'owner' not in video_info:
2008                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2009                         return
2010                 video_uploader = video_info['owner']
2011
2012                 # title
2013                 if 'title' not in video_info:
2014                         self._downloader.trouble(u'ERROR: unable to extract video title')
2015                         return
2016                 video_title = video_info['title']
2017                 video_title = video_title.decode('utf-8')
2018
2019                 # thumbnail image
2020                 if 'thumbnail' not in video_info:
2021                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2022                         video_thumbnail = ''
2023                 else:
2024                         video_thumbnail = video_info['thumbnail']
2025
2026                 # upload date
2027                 upload_date = u'NA'
2028                 if 'upload_date' in video_info:
2029                         upload_time = video_info['upload_date']
2030                         timetuple = email.utils.parsedate_tz(upload_time)
2031                         if timetuple is not None:
2032                                 try:
2033                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2034                                 except:
2035                                         pass
2036
2037                 # description
2038                 video_description = video_info.get('description', 'No description available.')
2039
2040                 url_map = video_info['video_urls']
2041                 if len(url_map.keys()) > 0:
2042                         # Decide which formats to download
2043                         req_format = self._downloader.params.get('format', None)
2044                         format_limit = self._downloader.params.get('format_limit', None)
2045
2046                         if format_limit is not None and format_limit in self._available_formats:
2047                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2048                         else:
2049                                 format_list = self._available_formats
2050                         existing_formats = [x for x in format_list if x in url_map]
2051                         if len(existing_formats) == 0:
2052                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2053                                 return
2054                         if req_format is None:
2055                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2056                         elif req_format == 'worst':
2057                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2058                         elif req_format == '-1':
2059                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2060                         else:
2061                                 # Specific format
2062                                 if req_format not in url_map:
2063                                         self._downloader.trouble(u'ERROR: requested format not available')
2064                                         return
2065                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2066
2067                 results = []
2068                 for format_param, video_real_url in video_url_list:
2069                         # Extension
2070                         video_extension = self._video_extensions.get(format_param, 'mp4')
2071
2072                         results.append({
2073                                 'id':           video_id.decode('utf-8'),
2074                                 'url':          video_real_url.decode('utf-8'),
2075                                 'uploader':     video_uploader.decode('utf-8'),
2076                                 'upload_date':  upload_date,
2077                                 'title':        video_title,
2078                                 'ext':          video_extension.decode('utf-8'),
2079                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2080                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2081                                 'description':  video_description.decode('utf-8'),
2082                                 'player_url':   None,
2083                         })
2084                 return results
2085
2086 class BlipTVIE(InfoExtractor):
2087         """Information extractor for blip.tv"""
2088
2089         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2090         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2091         IE_NAME = u'blip.tv'
2092
2093         def report_extraction(self, file_id):
2094                 """Report information extraction."""
2095                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2096
2097         def report_direct_download(self, title):
2098                 """Report information extraction."""
2099                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2100
2101         def _real_extract(self, url):
2102                 mobj = re.match(self._VALID_URL, url)
2103                 if mobj is None:
2104                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2105                         return
2106
2107                 if '?' in url:
2108                         cchar = '&'
2109                 else:
2110                         cchar = '?'
2111                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2112                 request = urllib2.Request(json_url.encode('utf-8'))
2113                 self.report_extraction(mobj.group(1))
2114                 info = None
2115                 try:
2116                         urlh = urllib2.urlopen(request)
2117                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2118                                 basename = url.split('/')[-1]
2119                                 title,ext = os.path.splitext(basename)
2120                                 title = title.decode('UTF-8')
2121                                 ext = ext.replace('.', '')
2122                                 self.report_direct_download(title)
2123                                 info = {
2124                                         'id': title,
2125                                         'url': url,
2126                                         'title': title,
2127                                         'ext': ext,
2128                                         'urlhandle': urlh
2129                                 }
2130                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2131                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2132                         return
2133                 if info is None: # Regular URL
2134                         try:
2135                                 json_code = urlh.read()
2136                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2137                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2138                                 return
2139
2140                         try:
2141                                 json_data = json.loads(json_code)
2142                                 if 'Post' in json_data:
2143                                         data = json_data['Post']
2144                                 else:
2145                                         data = json_data
2146
2147                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2148                                 video_url = data['media']['url']
2149                                 umobj = re.match(self._URL_EXT, video_url)
2150                                 if umobj is None:
2151                                         raise ValueError('Can not determine filename extension')
2152                                 ext = umobj.group(1)
2153
2154                                 info = {
2155                                         'id': data['item_id'],
2156                                         'url': video_url,
2157                                         'uploader': data['display_name'],
2158                                         'upload_date': upload_date,
2159                                         'title': data['title'],
2160                                         'ext': ext,
2161                                         'format': data['media']['mimeType'],
2162                                         'thumbnail': data['thumbnailUrl'],
2163                                         'description': data['description'],
2164                                         'player_url': data['embedUrl']
2165                                 }
2166                         except (ValueError,KeyError), err:
2167                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2168                                 return
2169
2170                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2171                 return [info]
2172
2173
2174 class MyVideoIE(InfoExtractor):
2175         """Information Extractor for myvideo.de."""
2176
2177         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2178         IE_NAME = u'myvideo'
2179
2180         def __init__(self, downloader=None):
2181                 InfoExtractor.__init__(self, downloader)
2182
2183         def report_download_webpage(self, video_id):
2184                 """Report webpage download."""
2185                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2186
2187         def report_extraction(self, video_id):
2188                 """Report information extraction."""
2189                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2190
2191         def _real_extract(self,url):
2192                 mobj = re.match(self._VALID_URL, url)
2193                 if mobj is None:
2194                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2195                         return
2196
2197                 video_id = mobj.group(1)
2198
2199                 # Get video webpage
2200                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2201                 try:
2202                         self.report_download_webpage(video_id)
2203                         webpage = urllib2.urlopen(request).read()
2204                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2206                         return
2207
2208                 self.report_extraction(video_id)
2209                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2210                                  webpage)
2211                 if mobj is None:
2212                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2213                         return
2214                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2215
2216                 mobj = re.search('<title>([^<]+)</title>', webpage)
2217                 if mobj is None:
2218                         self._downloader.trouble(u'ERROR: unable to extract title')
2219                         return
2220
2221                 video_title = mobj.group(1)
2222
2223                 return [{
2224                         'id':           video_id,
2225                         'url':          video_url,
2226                         'uploader':     u'NA',
2227                         'upload_date':  u'NA',
2228                         'title':        video_title,
2229                         'ext':          u'flv',
2230                         'format':       u'NA',
2231                         'player_url':   None,
2232                 }]
2233
2234 class ComedyCentralIE(InfoExtractor):
2235         """Information extractor for The Daily Show and Colbert Report """
2236
2237         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2238         IE_NAME = u'comedycentral'
2239
2240         def report_extraction(self, episode_id):
2241                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2242
2243         def report_config_download(self, episode_id):
2244                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2245
2246         def report_index_download(self, episode_id):
2247                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2248
2249         def report_player_url(self, episode_id):
2250                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2251
2252         def _real_extract(self, url):
2253                 mobj = re.match(self._VALID_URL, url)
2254                 if mobj is None:
2255                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2256                         return
2257
2258                 if mobj.group('shortname'):
2259                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2260                                 url = u'http://www.thedailyshow.com/full-episodes/'
2261                         else:
2262                                 url = u'http://www.colbertnation.com/full-episodes/'
2263                         mobj = re.match(self._VALID_URL, url)
2264                         assert mobj is not None
2265
2266                 dlNewest = not mobj.group('episode')
2267                 if dlNewest:
2268                         epTitle = mobj.group('showname')
2269                 else:
2270                         epTitle = mobj.group('episode')
2271
2272                 req = urllib2.Request(url)
2273                 self.report_extraction(epTitle)
2274                 try:
2275                         htmlHandle = urllib2.urlopen(req)
2276                         html = htmlHandle.read()
2277                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2278                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2279                         return
2280                 if dlNewest:
2281                         url = htmlHandle.geturl()
2282                         mobj = re.match(self._VALID_URL, url)
2283                         if mobj is None:
2284                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2285                                 return
2286                         if mobj.group('episode') == '':
2287                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2288                                 return
2289                         epTitle = mobj.group('episode')
2290
2291                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2292                 if len(mMovieParams) == 0:
2293                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2294                         return
2295
2296                 playerUrl_raw = mMovieParams[0][0]
2297                 self.report_player_url(epTitle)
2298                 try:
2299                         urlHandle = urllib2.urlopen(playerUrl_raw)
2300                         playerUrl = urlHandle.geturl()
2301                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2302                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2303                         return
2304
2305                 uri = mMovieParams[0][1]
2306                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2307                 self.report_index_download(epTitle)
2308                 try:
2309                         indexXml = urllib2.urlopen(indexUrl).read()
2310                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2312                         return
2313
2314                 results = []
2315
2316                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2317                 itemEls = idoc.findall('.//item')
2318                 for itemEl in itemEls:
2319                         mediaId = itemEl.findall('./guid')[0].text
2320                         shortMediaId = mediaId.split(':')[-1]
2321                         showId = mediaId.split(':')[-2].replace('.com', '')
2322                         officialTitle = itemEl.findall('./title')[0].text
2323                         officialDate = itemEl.findall('./pubDate')[0].text
2324
2325                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2326                                                 urllib.urlencode({'uri': mediaId}))
2327                         configReq = urllib2.Request(configUrl)
2328                         self.report_config_download(epTitle)
2329                         try:
2330                                 configXml = urllib2.urlopen(configReq).read()
2331                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2332                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2333                                 return
2334
2335                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2336                         turls = []
2337                         for rendition in cdoc.findall('.//rendition'):
2338                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2339                                 turls.append(finfo)
2340
2341                         if len(turls) == 0:
2342                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2343                                 continue
2344
2345                         # For now, just pick the highest bitrate
2346                         format,video_url = turls[-1]
2347
2348                         effTitle = showId + u'-' + epTitle
2349                         info = {
2350                                 'id': shortMediaId,
2351                                 'url': video_url,
2352                                 'uploader': showId,
2353                                 'upload_date': officialDate,
2354                                 'title': effTitle,
2355                                 'ext': 'mp4',
2356                                 'format': format,
2357                                 'thumbnail': None,
2358                                 'description': officialTitle,
2359                                 'player_url': playerUrl
2360                         }
2361
2362                         results.append(info)
2363
2364                 return results
2365
2366
2367 class EscapistIE(InfoExtractor):
2368         """Information extractor for The Escapist """
2369
2370         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2371         IE_NAME = u'escapist'
2372
2373         def report_extraction(self, showName):
2374                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2375
2376         def report_config_download(self, showName):
2377                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2378
2379         def _real_extract(self, url):
2380                 mobj = re.match(self._VALID_URL, url)
2381                 if mobj is None:
2382                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2383                         return
2384                 showName = mobj.group('showname')
2385                 videoId = mobj.group('episode')
2386
2387                 self.report_extraction(showName)
2388                 try:
2389                         webPage = urllib2.urlopen(url)
2390                         webPageBytes = webPage.read()
2391                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2392                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2394                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2395                         return
2396
2397                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2398                 description = unescapeHTML(descMatch.group(1))
2399                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2400                 imgUrl = unescapeHTML(imgMatch.group(1))
2401                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2402                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2403                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2404                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2405
2406                 self.report_config_download(showName)
2407                 try:
2408                         configJSON = urllib2.urlopen(configUrl).read()
2409                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2410                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2411                         return
2412
2413                 # Technically, it's JavaScript, not JSON
2414                 configJSON = configJSON.replace("'", '"')
2415
2416                 try:
2417                         config = json.loads(configJSON)
2418                 except (ValueError,), err:
2419                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2420                         return
2421
2422                 playlist = config['playlist']
2423                 videoUrl = playlist[1]['url']
2424
2425                 info = {
2426                         'id': videoId,
2427                         'url': videoUrl,
2428                         'uploader': showName,
2429                         'upload_date': None,
2430                         'title': showName,
2431                         'ext': 'flv',
2432                         'format': 'flv',
2433                         'thumbnail': imgUrl,
2434                         'description': description,
2435                         'player_url': playerUrl,
2436                 }
2437
2438                 return [info]
2439
2440
2441 class CollegeHumorIE(InfoExtractor):
2442         """Information extractor for collegehumor.com"""
2443
2444         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2445         IE_NAME = u'collegehumor'
2446
2447         def report_webpage(self, video_id):
2448                 """Report information extraction."""
2449                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2450
2451         def report_extraction(self, video_id):
2452                 """Report information extraction."""
2453                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2454
2455         def _real_extract(self, url):
2456                 mobj = re.match(self._VALID_URL, url)
2457                 if mobj is None:
2458                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2459                         return
2460                 video_id = mobj.group('videoid')
2461
2462                 self.report_webpage(video_id)
2463                 request = urllib2.Request(url)
2464                 try:
2465                         webpage = urllib2.urlopen(request).read()
2466                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2468                         return
2469
2470                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2471                 if m is None:
2472                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2473                         return
2474                 internal_video_id = m.group('internalvideoid')
2475
2476                 info = {
2477                         'id': video_id,
2478                         'internal_id': internal_video_id,
2479                 }
2480
2481                 self.report_extraction(video_id)
2482                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2483                 try:
2484                         metaXml = urllib2.urlopen(xmlUrl).read()
2485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2487                         return
2488
2489                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2490                 try:
2491                         videoNode = mdoc.findall('./video')[0]
2492                         info['description'] = videoNode.findall('./description')[0].text
2493                         info['title'] = videoNode.findall('./caption')[0].text
2494                         info['url'] = videoNode.findall('./file')[0].text
2495                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2496                         info['ext'] = info['url'].rpartition('.')[2]
2497                         info['format'] = info['ext']
2498                 except IndexError:
2499                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2500                         return
2501
2502                 return [info]
2503
2504
2505 class XVideosIE(InfoExtractor):
2506         """Information extractor for xvideos.com"""
2507
2508         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2509         IE_NAME = u'xvideos'
2510
2511         def report_webpage(self, video_id):
2512                 """Report information extraction."""
2513                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2514
2515         def report_extraction(self, video_id):
2516                 """Report information extraction."""
2517                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2518
2519         def _real_extract(self, url):
2520                 mobj = re.match(self._VALID_URL, url)
2521                 if mobj is None:
2522                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2523                         return
2524                 video_id = mobj.group(1).decode('utf-8')
2525
2526                 self.report_webpage(video_id)
2527
2528                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2529                 try:
2530                         webpage = urllib2.urlopen(request).read()
2531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2532                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2533                         return
2534
2535                 self.report_extraction(video_id)
2536
2537
2538                 # Extract video URL
2539                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2540                 if mobj is None:
2541                         self._downloader.trouble(u'ERROR: unable to extract video url')
2542                         return
2543                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2544
2545
2546                 # Extract title
2547                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2548                 if mobj is None:
2549                         self._downloader.trouble(u'ERROR: unable to extract video title')
2550                         return
2551                 video_title = mobj.group(1).decode('utf-8')
2552
2553
2554                 # Extract video thumbnail
2555                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2556                 if mobj is None:
2557                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2558                         return
2559                 video_thumbnail = mobj.group(0).decode('utf-8')
2560
2561                 info = {
2562                         'id': video_id,
2563                         'url': video_url,
2564                         'uploader': None,
2565                         'upload_date': None,
2566                         'title': video_title,
2567                         'ext': 'flv',
2568                         'format': 'flv',
2569                         'thumbnail': video_thumbnail,
2570                         'description': None,
2571                         'player_url': None,
2572                 }
2573
2574                 return [info]
2575
2576
2577 class SoundcloudIE(InfoExtractor):
2578         """Information extractor for soundcloud.com
2579            To access the media, the uid of the song and a stream token
2580            must be extracted from the page source and the script must make
2581            a request to media.soundcloud.com/crossdomain.xml. Then
2582            the media can be grabbed by requesting from an url composed
2583            of the stream token and uid
2584          """
2585
2586         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2587         IE_NAME = u'soundcloud'
2588
2589         def __init__(self, downloader=None):
2590                 InfoExtractor.__init__(self, downloader)
2591
2592         def report_webpage(self, video_id):
2593                 """Report information extraction."""
2594                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2595
2596         def report_extraction(self, video_id):
2597                 """Report information extraction."""
2598                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2599
2600         def _real_extract(self, url):
2601                 mobj = re.match(self._VALID_URL, url)
2602                 if mobj is None:
2603                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2604                         return
2605
2606                 # extract uploader (which is in the url)
2607                 uploader = mobj.group(1).decode('utf-8')
2608                 # extract simple title (uploader + slug of song title)
2609                 slug_title =  mobj.group(2).decode('utf-8')
2610                 simple_title = uploader + u'-' + slug_title
2611
2612                 self.report_webpage('%s/%s' % (uploader, slug_title))
2613
2614                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2615                 try:
2616                         webpage = urllib2.urlopen(request).read()
2617                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2619                         return
2620
2621                 self.report_extraction('%s/%s' % (uploader, slug_title))
2622
2623                 # extract uid and stream token that soundcloud hands out for access
2624                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2625                 if mobj:
2626                         video_id = mobj.group(1)
2627                         stream_token = mobj.group(2)
2628
2629                 # extract unsimplified title
2630                 mobj = re.search('"title":"(.*?)",', webpage)
2631                 if mobj:
2632                         title = mobj.group(1).decode('utf-8')
2633                 else:
2634                         title = simple_title
2635
2636                 # construct media url (with uid/token)
2637                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2638                 mediaURL = mediaURL % (video_id, stream_token)
2639
2640                 # description
2641                 description = u'No description available'
2642                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2643                 if mobj:
2644                         description = mobj.group(1)
2645
2646                 # upload date
2647                 upload_date = None
2648                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2649                 if mobj:
2650                         try:
2651                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2652                         except Exception, e:
2653                                 self._downloader.to_stderr(str(e))
2654
2655                 # for soundcloud, a request to a cross domain is required for cookies
2656                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2657
2658                 return [{
2659                         'id':           video_id.decode('utf-8'),
2660                         'url':          mediaURL,
2661                         'uploader':     uploader.decode('utf-8'),
2662                         'upload_date':  upload_date,
2663                         'title':        title,
2664                         'ext':          u'mp3',
2665                         'format':       u'NA',
2666                         'player_url':   None,
2667                         'description': description.decode('utf-8')
2668                 }]
2669
2670
2671 class InfoQIE(InfoExtractor):
2672         """Information extractor for infoq.com"""
2673
2674         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2675         IE_NAME = u'infoq'
2676
2677         def report_webpage(self, video_id):
2678                 """Report information extraction."""
2679                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2680
2681         def report_extraction(self, video_id):
2682                 """Report information extraction."""
2683                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2684
2685         def _real_extract(self, url):
2686                 mobj = re.match(self._VALID_URL, url)
2687                 if mobj is None:
2688                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2689                         return
2690
2691                 self.report_webpage(url)
2692
2693                 request = urllib2.Request(url)
2694                 try:
2695                         webpage = urllib2.urlopen(request).read()
2696                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2697                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2698                         return
2699
2700                 self.report_extraction(url)
2701
2702
2703                 # Extract video URL
2704                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2705                 if mobj is None:
2706                         self._downloader.trouble(u'ERROR: unable to extract video url')
2707                         return
2708                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2709
2710
2711                 # Extract title
2712                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2713                 if mobj is None:
2714                         self._downloader.trouble(u'ERROR: unable to extract video title')
2715                         return
2716                 video_title = mobj.group(1).decode('utf-8')
2717
2718                 # Extract description
2719                 video_description = u'No description available.'
2720                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2721                 if mobj is not None:
2722                         video_description = mobj.group(1).decode('utf-8')
2723
2724                 video_filename = video_url.split('/')[-1]
2725                 video_id, extension = video_filename.split('.')
2726
2727                 info = {
2728                         'id': video_id,
2729                         'url': video_url,
2730                         'uploader': None,
2731                         'upload_date': None,
2732                         'title': video_title,
2733                         'ext': extension,
2734                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2735                         'thumbnail': None,
2736                         'description': video_description,
2737                         'player_url': None,
2738                 }
2739
2740                 return [info]
2741
2742 class MixcloudIE(InfoExtractor):
2743         """Information extractor for www.mixcloud.com"""
2744         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2745         IE_NAME = u'mixcloud'
2746
2747         def __init__(self, downloader=None):
2748                 InfoExtractor.__init__(self, downloader)
2749
2750         def report_download_json(self, file_id):
2751                 """Report JSON download."""
2752                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2753
2754         def report_extraction(self, file_id):
2755                 """Report information extraction."""
2756                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2757
2758         def get_urls(self, jsonData, fmt, bitrate='best'):
2759                 """Get urls from 'audio_formats' section in json"""
2760                 file_url = None
2761                 try:
2762                         bitrate_list = jsonData[fmt]
2763                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764                                 bitrate = max(bitrate_list) # select highest
2765
2766                         url_list = jsonData[fmt][bitrate]
2767                 except TypeError: # we have no bitrate info.
2768                         url_list = jsonData[fmt]
2769                 return url_list
2770
2771         def check_urls(self, url_list):
2772                 """Returns 1st active url from list"""
2773                 for url in url_list:
2774                         try:
2775                                 urllib2.urlopen(url)
2776                                 return url
2777                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778                                 url = None
2779
2780                 return None
2781
2782         def _print_formats(self, formats):
2783                 print 'Available formats:'
2784                 for fmt in formats.keys():
2785                         for b in formats[fmt]:
2786                                 try:
2787                                         ext = formats[fmt][b][0]
2788                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2789                                 except TypeError: # we have no bitrate info
2790                                         ext = formats[fmt][0]
2791                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2792                                         break
2793
2794         def _real_extract(self, url):
2795                 mobj = re.match(self._VALID_URL, url)
2796                 if mobj is None:
2797                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2798                         return
2799                 # extract uploader & filename from url
2800                 uploader = mobj.group(1).decode('utf-8')
2801                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802
2803                 # construct API request
2804                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2805                 # retrieve .json file with links to files
2806                 request = urllib2.Request(file_url)
2807                 try:
2808                         self.report_download_json(file_url)
2809                         jsonData = urllib2.urlopen(request).read()
2810                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2811                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2812                         return
2813
2814                 # parse JSON
2815                 json_data = json.loads(jsonData)
2816                 player_url = json_data['player_swf_url']
2817                 formats = dict(json_data['audio_formats'])
2818
2819                 req_format = self._downloader.params.get('format', None)
2820                 bitrate = None
2821
2822                 if self._downloader.params.get('listformats', None):
2823                         self._print_formats(formats)
2824                         return
2825
2826                 if req_format is None or req_format == 'best':
2827                         for format_param in formats.keys():
2828                                 url_list = self.get_urls(formats, format_param)
2829                                 # check urls
2830                                 file_url = self.check_urls(url_list)
2831                                 if file_url is not None:
2832                                         break # got it!
2833                 else:
2834                         if req_format not in formats.keys():
2835                                 self._downloader.trouble(u'ERROR: format is not available')
2836                                 return
2837
2838                         url_list = self.get_urls(formats, req_format)
2839                         file_url = self.check_urls(url_list)
2840                         format_param = req_format
2841
2842                 return [{
2843                         'id': file_id.decode('utf-8'),
2844                         'url': file_url.decode('utf-8'),
2845                         'uploader':     uploader.decode('utf-8'),
2846                         'upload_date': u'NA',
2847                         'title': json_data['name'],
2848                         'ext': file_url.split('.')[-1].decode('utf-8'),
2849                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2850                         'thumbnail': json_data['thumbnail_url'],
2851                         'description': json_data['description'],
2852                         'player_url': player_url.decode('utf-8'),
2853                 }]
2854
2855 class StanfordOpenClassroomIE(InfoExtractor):
2856         """Information extractor for Stanford's Open ClassRoom"""
2857
2858         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2859         IE_NAME = u'stanfordoc'
2860
2861         def report_download_webpage(self, objid):
2862                 """Report information extraction."""
2863                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2864
2865         def report_extraction(self, video_id):
2866                 """Report information extraction."""
2867                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2868
2869         def _real_extract(self, url):
2870                 mobj = re.match(self._VALID_URL, url)
2871                 if mobj is None:
2872                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2873                         return
2874
2875                 if mobj.group('course') and mobj.group('video'): # A specific video
2876                         course = mobj.group('course')
2877                         video = mobj.group('video')
2878                         info = {
2879                                 'id': course + '_' + video,
2880                         }
2881
2882                         self.report_extraction(info['id'])
2883                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2884                         xmlUrl = baseUrl + video + '.xml'
2885                         try:
2886                                 metaXml = urllib2.urlopen(xmlUrl).read()
2887                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2888                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2889                                 return
2890                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2891                         try:
2892                                 info['title'] = mdoc.findall('./title')[0].text
2893                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2894                         except IndexError:
2895                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2896                                 return
2897                         info['ext'] = info['url'].rpartition('.')[2]
2898                         info['format'] = info['ext']
2899                         return [info]
2900                 elif mobj.group('course'): # A course page
2901                         course = mobj.group('course')
2902                         info = {
2903                                 'id': course,
2904                                 'type': 'playlist',
2905                         }
2906
2907                         self.report_download_webpage(info['id'])
2908                         try:
2909                                 coursepage = urllib2.urlopen(url).read()
2910                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2911                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2912                                 return
2913
2914                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2915                         if m:
2916                                 info['title'] = unescapeHTML(m.group(1))
2917                         else:
2918                                 info['title'] = info['id']
2919
2920                         m = re.search('<description>([^<]+)</description>', coursepage)
2921                         if m:
2922                                 info['description'] = unescapeHTML(m.group(1))
2923
2924                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2925                         info['list'] = [
2926                                 {
2927                                         'type': 'reference',
2928                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2929                                 }
2930                                         for vpage in links]
2931                         results = []
2932                         for entry in info['list']:
2933                                 assert entry['type'] == 'reference'
2934                                 results += self.extract(entry['url'])
2935                         return results
2936
2937                 else: # Root page
2938                         info = {
2939                                 'id': 'Stanford OpenClassroom',
2940                                 'type': 'playlist',
2941                         }
2942
2943                         self.report_download_webpage(info['id'])
2944                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2945                         try:
2946                                 rootpage = urllib2.urlopen(rootURL).read()
2947                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2948                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2949                                 return
2950
2951                         info['title'] = info['id']
2952
2953                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2954                         info['list'] = [
2955                                 {
2956                                         'type': 'reference',
2957                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2958                                 }
2959                                         for cpage in links]
2960
2961                         results = []
2962                         for entry in info['list']:
2963                                 assert entry['type'] == 'reference'
2964                                 results += self.extract(entry['url'])
2965                         return results
2966
2967 class MTVIE(InfoExtractor):
2968         """Information extractor for MTV.com"""
2969
2970         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2971         IE_NAME = u'mtv'
2972
2973         def report_webpage(self, video_id):
2974                 """Report information extraction."""
2975                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2976
2977         def report_extraction(self, video_id):
2978                 """Report information extraction."""
2979                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2980
2981         def _real_extract(self, url):
2982                 mobj = re.match(self._VALID_URL, url)
2983                 if mobj is None:
2984                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2985                         return
2986                 if not mobj.group('proto'):
2987                         url = 'http://' + url
2988                 video_id = mobj.group('videoid')
2989                 self.report_webpage(video_id)
2990
2991                 request = urllib2.Request(url)
2992                 try:
2993                         webpage = urllib2.urlopen(request).read()
2994                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2996                         return
2997
2998                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2999                 if mobj is None:
3000                         self._downloader.trouble(u'ERROR: unable to extract song name')
3001                         return
3002                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3003                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3004                 if mobj is None:
3005                         self._downloader.trouble(u'ERROR: unable to extract performer')
3006                         return
3007                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3008                 video_title = performer + ' - ' + song_name
3009
3010                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3011                 if mobj is None:
3012                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3013                         return
3014                 mtvn_uri = mobj.group(1)
3015
3016                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3017                 if mobj is None:
3018                         self._downloader.trouble(u'ERROR: unable to extract content id')
3019                         return
3020                 content_id = mobj.group(1)
3021
3022                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3023                 self.report_extraction(video_id)
3024                 request = urllib2.Request(videogen_url)
3025                 try:
3026                         metadataXml = urllib2.urlopen(request).read()
3027                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3028                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3029                         return
3030
3031                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3032                 renditions = mdoc.findall('.//rendition')
3033
3034                 # For now, always pick the highest quality.
3035                 rendition = renditions[-1]
3036
3037                 try:
3038                         _,_,ext = rendition.attrib['type'].partition('/')
3039                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3040                         video_url = rendition.find('./src').text
3041                 except KeyError:
3042                         self._downloader.trouble('Invalid rendition field.')
3043                         return
3044
3045                 info = {
3046                         'id': video_id,
3047                         'url': video_url,
3048                         'uploader': performer,
3049                         'title': video_title,
3050                         'ext': ext,
3051                         'format': format,
3052                 }
3053
3054                 return [info]
3055
3056
3057 class YoukuIE(InfoExtractor):
3058
3059         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3060         IE_NAME = u'Youku'
3061
3062         def __init__(self, downloader=None):
3063                 InfoExtractor.__init__(self, downloader)
3064
3065         def report_download_webpage(self, file_id):
3066                 """Report webpage download."""
3067                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3068
3069         def report_extraction(self, file_id):
3070                 """Report information extraction."""
3071                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3072
3073         def _gen_sid(self):
3074                 nowTime = int(time.time() * 1000)
3075                 random1 = random.randint(1000,1998)
3076                 random2 = random.randint(1000,9999)
3077
3078                 return "%d%d%d" %(nowTime,random1,random2)
3079
3080         def _get_file_ID_mix_string(self, seed):
3081                 mixed = []
3082                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3083                 seed = float(seed)
3084                 for i in range(len(source)):
3085                         seed  =  (seed * 211 + 30031 ) % 65536
3086                         index  =  math.floor(seed / 65536 * len(source) )
3087                         mixed.append(source[int(index)])
3088                         source.remove(source[int(index)])
3089                 #return ''.join(mixed)
3090                 return mixed
3091
3092         def _get_file_id(self, fileId, seed):
3093                 mixed = self._get_file_ID_mix_string(seed)
3094                 ids = fileId.split('*')
3095                 realId = []
3096                 for ch in ids:
3097                         if ch:
3098                                 realId.append(mixed[int(ch)])
3099                 return ''.join(realId)
3100
3101         def _real_extract(self, url):
3102                 mobj = re.match(self._VALID_URL, url)
3103                 if mobj is None:
3104                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3105                         return
3106                 video_id = mobj.group('ID')
3107
3108                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3109
3110                 request = urllib2.Request(info_url, None, std_headers)
3111                 try:
3112                         self.report_download_webpage(video_id)
3113                         jsondata = urllib2.urlopen(request).read()
3114                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3115                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3116                         return
3117
3118                 self.report_extraction(video_id)
3119                 try:
3120                         config = json.loads(jsondata)
3121
3122                         video_title =  config['data'][0]['title']
3123                         seed = config['data'][0]['seed']
3124
3125                         format = self._downloader.params.get('format', None)
3126                         supported_format = config['data'][0]['streamfileids'].keys()
3127
3128                         if format is None or format == 'best':
3129                                 if 'hd2' in supported_format:
3130                                         format = 'hd2'
3131                                 else:
3132                                         format = 'flv'
3133                                 ext = u'flv'
3134                         elif format == 'worst':
3135                                 format = 'mp4'
3136                                 ext = u'mp4'
3137                         else:
3138                                 format = 'flv'
3139                                 ext = u'flv'
3140
3141
3142                         fileid = config['data'][0]['streamfileids'][format]
3143                         seg_number = len(config['data'][0]['segs'][format])
3144
3145                         keys=[]
3146                         for i in xrange(seg_number):
3147                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3148
3149                         #TODO check error
3150                         #youku only could be viewed from mainland china
3151                 except:
3152                         self._downloader.trouble(u'ERROR: unable to extract info section')
3153                         return
3154
3155                 files_info=[]
3156                 sid = self._gen_sid()
3157                 fileid = self._get_file_id(fileid, seed)
3158
3159                 #column 8,9 of fileid represent the segment number
3160                 #fileid[7:9] should be changed
3161                 for index, key in enumerate(keys):
3162
3163                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3164                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3165
3166                         info = {
3167                                 'id': '%s_part%02d' % (video_id, index),
3168                                 'url': download_url,
3169                                 'uploader': None,
3170                                 'title': video_title,
3171                                 'ext': ext,
3172                                 'format': u'NA'
3173                         }
3174                         files_info.append(info)
3175
3176                 return files_info
3177
3178
3179 class XNXXIE(InfoExtractor):
3180         """Information extractor for xnxx.com"""
3181
3182         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3183         IE_NAME = u'xnxx'
3184         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3185         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3186         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3187
3188         def report_webpage(self, video_id):
3189                 """Report information extraction"""
3190                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3191
3192         def report_extraction(self, video_id):
3193                 """Report information extraction"""
3194                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3195
3196         def _real_extract(self, url):
3197                 mobj = re.match(self._VALID_URL, url)
3198                 if mobj is None:
3199                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3200                         return
3201                 video_id = mobj.group(1).decode('utf-8')
3202
3203                 self.report_webpage(video_id)
3204
3205                 # Get webpage content
3206                 try:
3207                         webpage = urllib2.urlopen(url).read()
3208                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3209                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3210                         return
3211
3212                 result = re.search(self.VIDEO_URL_RE, webpage)
3213                 if result is None:
3214                         self._downloader.trouble(u'ERROR: unable to extract video url')
3215                         return
3216                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3217
3218                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3219                 if result is None:
3220                         self._downloader.trouble(u'ERROR: unable to extract video title')
3221                         return
3222                 video_title = result.group(1).decode('utf-8')
3223
3224                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3225                 if result is None:
3226                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3227                         return
3228                 video_thumbnail = result.group(1).decode('utf-8')
3229
3230                 info = {'id': video_id,
3231                                 'url': video_url,
3232                                 'uploader': None,
3233                                 'upload_date': None,
3234                                 'title': video_title,
3235                                 'ext': 'flv',
3236                                 'format': 'flv',
3237                                 'thumbnail': video_thumbnail,
3238                                 'description': None,
3239                                 'player_url': None}
3240
3241                 return [info]
3242
3243
3244 class GooglePlusIE(InfoExtractor):
3245         """Information extractor for plus.google.com."""
3246
3247         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3248         IE_NAME = u'plus.google'
3249
3250         def __init__(self, downloader=None):
3251                 InfoExtractor.__init__(self, downloader)
3252
3253         def report_extract_entry(self, url):
3254                 """Report downloading extry"""
3255                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3256
3257         def report_date(self, upload_date):
3258                 """Report downloading extry"""
3259                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3260
3261         def report_uploader(self, uploader):
3262                 """Report downloading extry"""
3263                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3264
3265         def report_title(self, video_title):
3266                 """Report downloading extry"""
3267                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3268
3269         def report_extract_vid_page(self, video_page):
3270                 """Report information extraction."""
3271                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3272
3273         def _real_extract(self, url):
3274                 # Extract id from URL
3275                 mobj = re.match(self._VALID_URL, url)
3276                 if mobj is None:
3277                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3278                         return
3279
3280                 post_url = mobj.group(0)
3281                 video_id = mobj.group(2)
3282
3283                 video_extension = 'flv'
3284
3285                 # Step 1, Retrieve post webpage to extract further information
3286                 self.report_extract_entry(post_url)
3287                 request = urllib2.Request(post_url)
3288                 try:
3289                         webpage = urllib2.urlopen(request).read()
3290                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3291                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3292                         return
3293
3294                 # Extract update date
3295                 upload_date = u'NA'
3296                 pattern = 'title="Timestamp">(.*?)</a>'
3297                 mobj = re.search(pattern, webpage)
3298                 if mobj:
3299                         upload_date = mobj.group(1)
3300                         # Convert timestring to a format suitable for filename
3301                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3302                         upload_date = upload_date.strftime('%Y%m%d')
3303                 self.report_date(upload_date)
3304
3305                 # Extract uploader
3306                 uploader = u'NA'
3307                 pattern = r'rel\="author".*?>(.*?)</a>'
3308                 mobj = re.search(pattern, webpage)
3309                 if mobj:
3310                         uploader = mobj.group(1)
3311                 self.report_uploader(uploader)
3312
3313                 # Extract title
3314                 # Get the first line for title
3315                 video_title = u'NA'
3316                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3317                 mobj = re.search(pattern, webpage)
3318                 if mobj:
3319                         video_title = mobj.group(1)
3320                 self.report_title(video_title)
3321
3322                 # Step 2, Stimulate clicking the image box to launch video
3323                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3324                 mobj = re.search(pattern, webpage)
3325                 if mobj is None:
3326                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3327
3328                 video_page = mobj.group(1)
3329                 request = urllib2.Request(video_page)
3330                 try:
3331                         webpage = urllib2.urlopen(request).read()
3332                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3333                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3334                         return
3335                 self.report_extract_vid_page(video_page)
3336
3337
3338                 # Extract video links on video page
3339                 """Extract video links of all sizes"""
3340                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3341                 mobj = re.findall(pattern, webpage)
3342                 if len(mobj) == 0:
3343                         self._downloader.trouble(u'ERROR: unable to extract video links')
3344
3345                 # Sort in resolution
3346                 links = sorted(mobj)
3347
3348                 # Choose the lowest of the sort, i.e. highest resolution
3349                 video_url = links[-1]
3350                 # Only get the url. The resolution part in the tuple has no use anymore
3351                 video_url = video_url[-1]
3352                 # Treat escaped \u0026 style hex
3353                 video_url = unicode(video_url, "unicode_escape")
3354
3355
3356                 return [{
3357                         'id':           video_id.decode('utf-8'),
3358                         'url':          video_url,
3359                         'uploader':     uploader.decode('utf-8'),
3360                         'upload_date':  upload_date.decode('utf-8'),
3361                         'title':        video_title.decode('utf-8'),
3362                         'ext':          video_extension.decode('utf-8'),
3363                         'format':       u'NA',
3364                         'player_url':   None,
3365                 }]