git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import HTMLParser
   6 import httplib
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import time
  12 import urllib
  13 import urllib2
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 from urlparse import parse_qs
  19
  20 try:
  21         import cStringIO as StringIO
  22 except ImportError:
  23         import StringIO
  24
  25 from utils import *
  26
  27
  28 class InfoExtractor(object):
  29         """Information Extractor class.
  30
  31         Information extractors are the classes that, given a URL, extract
  32         information from the video (or videos) the URL refers to. This
  33         information includes the real video URL, the video title and simplified
  34         title, author and others. The information is stored in a dictionary
  35         which is then passed to the FileDownloader. The FileDownloader
  36         processes this information possibly downloading the video to the file
  37         system, among other possible outcomes. The dictionaries must include
  38         the following fields:
  39
  40         id:             Video identifier.
  41         url:            Final video URL.
  42         uploader:       Nickname of the video uploader.
  43         title:          Literal title.
  44         ext:            Video filename extension.
  45         format:         Video format.
  46         player_url:     SWF Player URL (may be None).
  47
  48         The following fields are optional. Their primary purpose is to allow
  49         youtube-dl to serve as the backend for a video search function, such
  50         as the one in youtube2mp3.  They are only used when their respective
  51         forced printing functions are called:
  52
  53         thumbnail:      Full URL to a video thumbnail image.
  54         description:    One-line video description.
  55
  56         Subclasses of this one should re-define the _real_initialize() and
  57         _real_extract() methods and define a _VALID_URL regexp.
  58         Probably, they should also be added to the list of extractors.
  59         """
  60
  61         _ready = False
  62         _downloader = None
  63
  64         def __init__(self, downloader=None):
  65                 """Constructor. Receives an optional downloader."""
  66                 self._ready = False
  67                 self.set_downloader(downloader)
  68
  69         def suitable(self, url):
  70                 """Receives a URL and returns True if suitable for this IE."""
  71                 return re.match(self._VALID_URL, url) is not None
  72
  73         def initialize(self):
  74                 """Initializes an instance (authentication, etc)."""
  75                 if not self._ready:
  76                         self._real_initialize()
  77                         self._ready = True
  78
  79         def extract(self, url):
  80                 """Extracts URL information and returns it in list of dicts."""
  81                 self.initialize()
  82                 return self._real_extract(url)
  83
  84         def set_downloader(self, downloader):
  85                 """Sets the downloader for this IE."""
  86                 self._downloader = downloader
  87
  88         def _real_initialize(self):
  89                 """Real initialization process. Redefine in subclasses."""
  90                 pass
  91
  92         def _real_extract(self, url):
  93                 """Real extraction process. Redefine in subclasses."""
  94                 pass
  95
  96
  97 class YoutubeIE(InfoExtractor):
  98         """Information extractor for youtube.com."""
  99
 100         _VALID_URL = r"""^
 101                          (
 102                              (?:https?://)?                                       # http(s):// (optional)
 103                              (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 104                                 tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 105                              (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 106                              (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
 107                              (?:                                                  # the various things that can precede the ID:
 108                                  (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 109                                  |(?:                                             # or the v= param in all its forms
 110                                      (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 111                                      (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 112                                      (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
 113                                      v=
 114                                  )
 115                              )?                                                   # optional -> youtube.com/xxxx is OK
 116                          )?                                                       # all until now is optional -> you can pass the naked ID
 117                          ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 118                          (?(1).+)?                                                # if we found the ID, everything can follow
 119                          $"""
 120         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 121         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
 122         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 123         _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 124         _NETRC_MACHINE = 'youtube'
 125         # Listed in order of quality
 126         _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 127         _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 128         _video_extensions = {
 129                 '13': '3gp',
 130                 '17': 'mp4',
 131                 '18': 'mp4',
 132                 '22': 'mp4',
 133                 '37': 'mp4',
 134                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 135                 '43': 'webm',
 136                 '44': 'webm',
 137                 '45': 'webm',
 138                 '46': 'webm',
 139         }
 140         _video_dimensions = {
 141                 '5': '240x400',
 142                 '6': '???',
 143                 '13': '???',
 144                 '17': '144x176',
 145                 '18': '360x640',
 146                 '22': '720x1280',
 147                 '34': '360x640',
 148                 '35': '480x854',
 149                 '37': '1080x1920',
 150                 '38': '3072x4096',
 151                 '43': '360x640',
 152                 '44': '480x854',
 153                 '45': '720x1280',
 154                 '46': '1080x1920',
 155         }
 156         IE_NAME = u'youtube'
 157
 158         def suitable(self, url):
 159                 """Receives a URL and returns True if suitable for this IE."""
 160                 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
 161
 162         def report_lang(self):
 163                 """Report attempt to set language."""
 164                 self._downloader.to_screen(u'[youtube] Setting language')
 165
 166         def report_login(self):
 167                 """Report attempt to log in."""
 168                 self._downloader.to_screen(u'[youtube] Logging in')
 169
 170         def report_age_confirmation(self):
 171                 """Report attempt to confirm age."""
 172                 self._downloader.to_screen(u'[youtube] Confirming age')
 173
 174         def report_video_webpage_download(self, video_id):
 175                 """Report attempt to download video webpage."""
 176                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
 177
 178         def report_video_info_webpage_download(self, video_id):
 179                 """Report attempt to download video info webpage."""
 180                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 181
 182         def report_video_subtitles_download(self, video_id):
 183                 """Report attempt to download video info webpage."""
 184                 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
 185
 186         def report_information_extraction(self, video_id):
 187                 """Report attempt to extract video information."""
 188                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
 189
 190         def report_unavailable_format(self, video_id, format):
 191                 """Report extracted video URL."""
 192                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
 193
 194         def report_rtmp_download(self):
 195                 """Indicate the download will use the RTMP protocol."""
 196                 self._downloader.to_screen(u'[youtube] RTMP download detected')
 197
 198         def _closed_captions_xml_to_srt(self, xml_string):
 199                 srt = ''
 200                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
 201                 # TODO parse xml instead of regex
 202                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
 203                         if not dur: dur = '4'
 204                         start = float(start)
 205                         end = start + float(dur)
 206                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
 207                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
 208                         caption = unescapeHTML(caption)
 209                         caption = unescapeHTML(caption) # double cycle, intentional
 210                         srt += str(n+1) + '\n'
 211                         srt += start + ' --> ' + end + '\n'
 212                         srt += caption + '\n\n'
 213                 return srt
 214
 215         def _print_formats(self, formats):
 216                 print 'Available formats:'
 217                 for x in formats:
 218                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
 219
 220         def _real_initialize(self):
 221                 if self._downloader is None:
 222                         return
 223
 224                 username = None
 225                 password = None
 226                 downloader_params = self._downloader.params
 227
 228                 # Attempt to use provided username and password or .netrc data
 229                 if downloader_params.get('username', None) is not None:
 230                         username = downloader_params['username']
 231                         password = downloader_params['password']
 232                 elif downloader_params.get('usenetrc', False):
 233                         try:
 234                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 235                                 if info is not None:
 236                                         username = info[0]
 237                                         password = info[2]
 238                                 else:
 239                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 240                         except (IOError, netrc.NetrcParseError), err:
 241                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 242                                 return
 243
 244                 # Set language
 245                 request = urllib2.Request(self._LANG_URL)
 246                 try:
 247                         self.report_lang()
 248                         urllib2.urlopen(request).read()
 249                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 250                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 251                         return
 252
 253                 # No authentication to be performed
 254                 if username is None:
 255                         return
 256
 257                 # Log in
 258                 login_form = {
 259                                 'current_form': 'loginForm',
 260                                 'next':         '/',
 261                                 'action_login': 'Log In',
 262                                 'username':     username,
 263                                 'password':     password,
 264                                 }
 265                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
 266                 try:
 267                         self.report_login()
 268                         login_results = urllib2.urlopen(request).read()
 269                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 270                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 271                                 return
 272                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 273                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 274                         return
 275
 276                 # Confirm age
 277                 age_form = {
 278                                 'next_url':             '/',
 279                                 'action_confirm':       'Confirm',
 280                                 }
 281                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
 282                 try:
 283                         self.report_age_confirmation()
 284                         age_results = urllib2.urlopen(request).read()
 285                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 286                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 287                         return
 288
 289         def _real_extract(self, url):
 290                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 291                 mobj = re.search(self._NEXT_URL_RE, url)
 292                 if mobj:
 293                         url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')
 294
 295                 # Extract video id from URL
 296                 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 297                 if mobj is None:
 298                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 299                         return
 300                 video_id = mobj.group(2)
 301
 302                 # Get video webpage
 303                 self.report_video_webpage_download(video_id)
 304                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
 305                 try:
 306                         video_webpage = urllib2.urlopen(request).read()
 307                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 308                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 309                         return
 310
 311                 # Attempt to extract SWF player URL
 312                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 313                 if mobj is not None:
 314                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 315                 else:
 316                         player_url = None
 317
 318                 # Get video info
 319                 self.report_video_info_webpage_download(video_id)
 320                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 321                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 322                                         % (video_id, el_type))
 323                         request = urllib2.Request(video_info_url)
 324                         try:
 325                                 video_info_webpage = urllib2.urlopen(request).read()
 326                                 video_info = parse_qs(video_info_webpage)
 327                                 if 'token' in video_info:
 328                                         break
 329                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 330                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
 331                                 return
 332                 if 'token' not in video_info:
 333                         if 'reason' in video_info:
 334                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
 335                         else:
 336                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
 337                         return
 338
 339                 # Check for "rental" videos
 340                 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 341                         self._downloader.trouble(u'ERROR: "rental" videos not supported')
 342                         return
 343
 344                 # Start extracting information
 345                 self.report_information_extraction(video_id)
 346
 347                 # uploader
 348                 if 'author' not in video_info:
 349                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 350                         return
 351                 video_uploader = urllib.unquote_plus(video_info['author'][0])
 352
 353                 # title
 354                 if 'title' not in video_info:
 355                         self._downloader.trouble(u'ERROR: unable to extract video title')
 356                         return
 357                 video_title = urllib.unquote_plus(video_info['title'][0])
 358                 video_title = video_title.decode('utf-8')
 359
 360                 # thumbnail image
 361                 if 'thumbnail_url' not in video_info:
 362                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
 363                         video_thumbnail = ''
 364                 else:   # don't panic if we can't find it
 365                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
 366
 367                 # upload date
 368                 upload_date = u'NA'
 369                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 370                 if mobj is not None:
 371                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 372                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
 373                         for expression in format_expressions:
 374                                 try:
 375                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
 376                                 except:
 377                                         pass
 378
 379                 # description
 380                 video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
 381                 if video_description: video_description = clean_html(video_description)
 382                 else: video_description = ''
 383
 384                 # closed captions
 385                 video_subtitles = None
 386                 if self._downloader.params.get('writesubtitles', False):
 387                         try:
 388                                 self.report_video_subtitles_download(video_id)
 389                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 390                                 try:
 391                                         srt_list = urllib2.urlopen(request).read()
 392                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 393                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 394                                 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
 395                                 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
 396                                 if not srt_lang_list:
 397                                         raise Trouble(u'WARNING: video has no closed captions')
 398                                 if self._downloader.params.get('subtitleslang', False):
 399                                         srt_lang = self._downloader.params.get('subtitleslang')
 400                                 elif 'en' in srt_lang_list:
 401                                         srt_lang = 'en'
 402                                 else:
 403                                         srt_lang = srt_lang_list.keys()[0]
 404                                 if not srt_lang in srt_lang_list:
 405                                         raise Trouble(u'WARNING: no closed captions found in the specified language')
 406                                 request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
 407                                 try:
 408                                         srt_xml = urllib2.urlopen(request).read()
 409                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 410                                         raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
 411                                 if not srt_xml:
 412                                         raise Trouble(u'WARNING: unable to download video subtitles')
 413                                 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
 414                         except Trouble as trouble:
 415                                 self._downloader.trouble(trouble[0])
 416
 417                 # token
 418                 video_token = urllib.unquote_plus(video_info['token'][0])
 419
 420                 # Decide which formats to download
 421                 req_format = self._downloader.params.get('format', None)
 422
 423                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 424                         self.report_rtmp_download()
 425                         video_url_list = [(None, video_info['conn'][0])]
 426                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 427                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
 428                         url_data = [parse_qs(uds) for uds in url_data_strs]
 429                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
 430                         url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
 431
 432                         format_limit = self._downloader.params.get('format_limit', None)
 433                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 434                         if format_limit is not None and format_limit in available_formats:
 435                                 format_list = available_formats[available_formats.index(format_limit):]
 436                         else:
 437                                 format_list = available_formats
 438                         existing_formats = [x for x in format_list if x in url_map]
 439                         if len(existing_formats) == 0:
 440                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 441                                 return
 442                         if self._downloader.params.get('listformats', None):
 443                                 self._print_formats(existing_formats)
 444                                 return
 445                         if req_format is None or req_format == 'best':
 446                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 447                         elif req_format == 'worst':
 448                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 449                         elif req_format in ('-1', 'all'):
 450                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 451                         else:
 452                                 # Specific formats. We pick the first in a slash-delimeted sequence.
 453                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 454                                 req_formats = req_format.split('/')
 455                                 video_url_list = None
 456                                 for rf in req_formats:
 457                                         if rf in url_map:
 458                                                 video_url_list = [(rf, url_map[rf])]
 459                                                 break
 460                                 if video_url_list is None:
 461                                         self._downloader.trouble(u'ERROR: requested format not available')
 462                                         return
 463                 else:
 464                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
 465                         return
 466
 467                 results = []
 468                 for format_param, video_real_url in video_url_list:
 469                         # Extension
 470                         video_extension = self._video_extensions.get(format_param, 'flv')
 471
 472                         results.append({
 473                                 'id':           video_id.decode('utf-8'),
 474                                 'url':          video_real_url.decode('utf-8'),
 475                                 'uploader':     video_uploader.decode('utf-8'),
 476                                 'upload_date':  upload_date,
 477                                 'title':        video_title,
 478                                 'ext':          video_extension.decode('utf-8'),
 479                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
 480                                 'thumbnail':    video_thumbnail.decode('utf-8'),
 481                                 'description':  video_description,
 482                                 'player_url':   player_url,
 483                                 'subtitles':    video_subtitles
 484                         })
 485                 return results
 486
 487
 488 class MetacafeIE(InfoExtractor):
 489         """Information Extractor for metacafe.com."""
 490
 491         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 492         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 493         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 494         IE_NAME = u'metacafe'
 495
 496         def __init__(self, downloader=None):
 497                 InfoExtractor.__init__(self, downloader)
 498
 499         def report_disclaimer(self):
 500                 """Report disclaimer retrieval."""
 501                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
 502
 503         def report_age_confirmation(self):
 504                 """Report attempt to confirm age."""
 505                 self._downloader.to_screen(u'[metacafe] Confirming age')
 506
 507         def report_download_webpage(self, video_id):
 508                 """Report webpage download."""
 509                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
 510
 511         def report_extraction(self, video_id):
 512                 """Report information extraction."""
 513                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
 514
 515         def _real_initialize(self):
 516                 # Retrieve disclaimer
 517                 request = urllib2.Request(self._DISCLAIMER)
 518                 try:
 519                         self.report_disclaimer()
 520                         disclaimer = urllib2.urlopen(request).read()
 521                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 522                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 523                         return
 524
 525                 # Confirm age
 526                 disclaimer_form = {
 527                         'filters': '0',
 528                         'submit': "Continue - I'm over 18",
 529                         }
 530                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
 531                 try:
 532                         self.report_age_confirmation()
 533                         disclaimer = urllib2.urlopen(request).read()
 534                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 535                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 536                         return
 537
 538         def _real_extract(self, url):
 539                 # Extract id and simplified title from URL
 540                 mobj = re.match(self._VALID_URL, url)
 541                 if mobj is None:
 542                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 543                         return
 544
 545                 video_id = mobj.group(1)
 546
 547                 # Check if video comes from YouTube
 548                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 549                 if mobj2 is not None:
 550                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
 551                         return
 552
 553                 # Retrieve video webpage to extract further information
 554                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 555                 try:
 556                         self.report_download_webpage(video_id)
 557                         webpage = urllib2.urlopen(request).read()
 558                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 559                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 560                         return
 561
 562                 # Extract URL, uploader and title from webpage
 563                 self.report_extraction(video_id)
 564                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 565                 if mobj is not None:
 566                         mediaURL = urllib.unquote(mobj.group(1))
 567                         video_extension = mediaURL[-3:]
 568
 569                         # Extract gdaKey if available
 570                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 571                         if mobj is None:
 572                                 video_url = mediaURL
 573                         else:
 574                                 gdaKey = mobj.group(1)
 575                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 576                 else:
 577                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 578                         if mobj is None:
 579                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 580                                 return
 581                         vardict = parse_qs(mobj.group(1))
 582                         if 'mediaData' not in vardict:
 583                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 584                                 return
 585                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
 586                         if mobj is None:
 587                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
 588                                 return
 589                         mediaURL = mobj.group(1).replace('\\/', '/')
 590                         video_extension = mediaURL[-3:]
 591                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
 592
 593                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 594                 if mobj is None:
 595                         self._downloader.trouble(u'ERROR: unable to extract title')
 596                         return
 597                 video_title = mobj.group(1).decode('utf-8')
 598
 599                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
 600                 if mobj is None:
 601                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 602                         return
 603                 video_uploader = mobj.group(1)
 604
 605                 return [{
 606                         'id':           video_id.decode('utf-8'),
 607                         'url':          video_url.decode('utf-8'),
 608                         'uploader':     video_uploader.decode('utf-8'),
 609                         'upload_date':  u'NA',
 610                         'title':        video_title,
 611                         'ext':          video_extension.decode('utf-8'),
 612                         'format':       u'NA',
 613                         'player_url':   None,
 614                 }]
 615
 616
 617 class DailymotionIE(InfoExtractor):
 618         """Information Extractor for Dailymotion"""
 619
 620         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 621         IE_NAME = u'dailymotion'
 622
 623         def __init__(self, downloader=None):
 624                 InfoExtractor.__init__(self, downloader)
 625
 626         def report_download_webpage(self, video_id):
 627                 """Report webpage download."""
 628                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
 629
 630         def report_extraction(self, video_id):
 631                 """Report information extraction."""
 632                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
 633
 634         def _real_extract(self, url):
 635                 # Extract id and simplified title from URL
 636                 mobj = re.match(self._VALID_URL, url)
 637                 if mobj is None:
 638                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 639                         return
 640
 641                 video_id = mobj.group(1).split('_')[0].split('?')[0]
 642
 643                 video_extension = 'mp4'
 644
 645                 # Retrieve video webpage to extract further information
 646                 request = urllib2.Request(url)
 647                 request.add_header('Cookie', 'family_filter=off')
 648                 try:
 649                         self.report_download_webpage(video_id)
 650                         webpage = urllib2.urlopen(request).read()
 651                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 652                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 653                         return
 654
 655                 # Extract URL, uploader and title from webpage
 656                 self.report_extraction(video_id)
 657                 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 658                 if mobj is None:
 659                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 660                         return
 661                 flashvars = urllib.unquote(mobj.group(1))
 662
 663                 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 664                         if key in flashvars:
 665                                 max_quality = key
 666                                 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
 667                                 break
 668                 else:
 669                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 670                         return
 671
 672                 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 673                 if mobj is None:
 674                         self._downloader.trouble(u'ERROR: unable to extract video URL')
 675                         return
 676
 677                 video_url = urllib.unquote(mobj.group(1)).replace('\\/', '/')
 678
 679                 # TODO: support choosing qualities
 680
 681                 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 682                 if mobj is None:
 683                         self._downloader.trouble(u'ERROR: unable to extract title')
 684                         return
 685                 video_title = unescapeHTML(mobj.group('title').decode('utf-8'))
 686
 687                 video_uploader = u'NA'
 688                 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
 689                 if mobj is None:
 690                         self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
 691                 else:
 692                         video_uploader = mobj.group(1)
 693
 694                 video_upload_date = u'NA'
 695                 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 696                 if mobj is not None:
 697                         video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 698
 699                 return [{
 700                         'id':           video_id.decode('utf-8'),
 701                         'url':          video_url.decode('utf-8'),
 702                         'uploader':     video_uploader.decode('utf-8'),
 703                         'upload_date':  video_upload_date,
 704                         'title':        video_title,
 705                         'ext':          video_extension.decode('utf-8'),
 706                         'format':       u'NA',
 707                         'player_url':   None,
 708                 }]
 709
 710
 711 class GoogleIE(InfoExtractor):
 712         """Information extractor for video.google.com."""
 713
 714         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
 715         IE_NAME = u'video.google'
 716
 717         def __init__(self, downloader=None):
 718                 InfoExtractor.__init__(self, downloader)
 719
 720         def report_download_webpage(self, video_id):
 721                 """Report webpage download."""
 722                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
 723
 724         def report_extraction(self, video_id):
 725                 """Report information extraction."""
 726                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
 727
 728         def _real_extract(self, url):
 729                 # Extract id from URL
 730                 mobj = re.match(self._VALID_URL, url)
 731                 if mobj is None:
 732                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 733                         return
 734
 735                 video_id = mobj.group(1)
 736
 737                 video_extension = 'mp4'
 738
 739                 # Retrieve video webpage to extract further information
 740                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
 741                 try:
 742                         self.report_download_webpage(video_id)
 743                         webpage = urllib2.urlopen(request).read()
 744                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 745                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 746                         return
 747
 748                 # Extract URL, uploader, and title from webpage
 749                 self.report_extraction(video_id)
 750                 mobj = re.search(r"download_url:'([^']+)'", webpage)
 751                 if mobj is None:
 752                         video_extension = 'flv'
 753                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
 754                 if mobj is None:
 755                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 756                         return
 757                 mediaURL = urllib.unquote(mobj.group(1))
 758                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
 759                 mediaURL = mediaURL.replace('\\x26', '\x26')
 760
 761                 video_url = mediaURL
 762
 763                 mobj = re.search(r'<title>(.*)</title>', webpage)
 764                 if mobj is None:
 765                         self._downloader.trouble(u'ERROR: unable to extract title')
 766                         return
 767                 video_title = mobj.group(1).decode('utf-8')
 768
 769                 # Extract video description
 770                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
 771                 if mobj is None:
 772                         self._downloader.trouble(u'ERROR: unable to extract video description')
 773                         return
 774                 video_description = mobj.group(1).decode('utf-8')
 775                 if not video_description:
 776                         video_description = 'No description available.'
 777
 778                 # Extract video thumbnail
 779                 if self._downloader.params.get('forcethumbnail', False):
 780                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
 781                         try:
 782                                 webpage = urllib2.urlopen(request).read()
 783                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 784                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 785                                 return
 786                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
 787                         if mobj is None:
 788                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 789                                 return
 790                         video_thumbnail = mobj.group(1)
 791                 else:   # we need something to pass to process_info
 792                         video_thumbnail = ''
 793
 794                 return [{
 795                         'id':           video_id.decode('utf-8'),
 796                         'url':          video_url.decode('utf-8'),
 797                         'uploader':     u'NA',
 798                         'upload_date':  u'NA',
 799                         'title':        video_title,
 800                         'ext':          video_extension.decode('utf-8'),
 801                         'format':       u'NA',
 802                         'player_url':   None,
 803                 }]
 804
 805
 806 class PhotobucketIE(InfoExtractor):
 807         """Information extractor for photobucket.com."""
 808
 809         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 810         IE_NAME = u'photobucket'
 811
 812         def __init__(self, downloader=None):
 813                 InfoExtractor.__init__(self, downloader)
 814
 815         def report_download_webpage(self, video_id):
 816                 """Report webpage download."""
 817                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
 818
 819         def report_extraction(self, video_id):
 820                 """Report information extraction."""
 821                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
 822
 823         def _real_extract(self, url):
 824                 # Extract id from URL
 825                 mobj = re.match(self._VALID_URL, url)
 826                 if mobj is None:
 827                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 828                         return
 829
 830                 video_id = mobj.group(1)
 831
 832                 video_extension = 'flv'
 833
 834                 # Retrieve video webpage to extract further information
 835                 request = urllib2.Request(url)
 836                 try:
 837                         self.report_download_webpage(video_id)
 838                         webpage = urllib2.urlopen(request).read()
 839                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 840                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 841                         return
 842
 843                 # Extract URL, uploader, and title from webpage
 844                 self.report_extraction(video_id)
 845                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 846                 if mobj is None:
 847                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 848                         return
 849                 mediaURL = urllib.unquote(mobj.group(1))
 850
 851                 video_url = mediaURL
 852
 853                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 854                 if mobj is None:
 855                         self._downloader.trouble(u'ERROR: unable to extract title')
 856                         return
 857                 video_title = mobj.group(1).decode('utf-8')
 858
 859                 video_uploader = mobj.group(2).decode('utf-8')
 860
 861                 return [{
 862                         'id':           video_id.decode('utf-8'),
 863                         'url':          video_url.decode('utf-8'),
 864                         'uploader':     video_uploader,
 865                         'upload_date':  u'NA',
 866                         'title':        video_title,
 867                         'ext':          video_extension.decode('utf-8'),
 868                         'format':       u'NA',
 869                         'player_url':   None,
 870                 }]
 871
 872
 873 class YahooIE(InfoExtractor):
 874         """Information extractor for video.yahoo.com."""
 875
 876         # _VALID_URL matches all Yahoo! Video URLs
 877         # _VPAGE_URL matches only the extractable '/watch/' URLs
 878         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
 879         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
 880         IE_NAME = u'video.yahoo'
 881
 882         def __init__(self, downloader=None):
 883                 InfoExtractor.__init__(self, downloader)
 884
 885         def report_download_webpage(self, video_id):
 886                 """Report webpage download."""
 887                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
 888
 889         def report_extraction(self, video_id):
 890                 """Report information extraction."""
 891                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
 892
 893         def _real_extract(self, url, new_video=True):
 894                 # Extract ID from URL
 895                 mobj = re.match(self._VALID_URL, url)
 896                 if mobj is None:
 897                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
 898                         return
 899
 900                 video_id = mobj.group(2)
 901                 video_extension = 'flv'
 902
 903                 # Rewrite valid but non-extractable URLs as
 904                 # extractable English language /watch/ URLs
 905                 if re.match(self._VPAGE_URL, url) is None:
 906                         request = urllib2.Request(url)
 907                         try:
 908                                 webpage = urllib2.urlopen(request).read()
 909                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 910                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 911                                 return
 912
 913                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
 914                         if mobj is None:
 915                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
 916                                 return
 917                         yahoo_id = mobj.group(1)
 918
 919                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
 920                         if mobj is None:
 921                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
 922                                 return
 923                         yahoo_vid = mobj.group(1)
 924
 925                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
 926                         return self._real_extract(url, new_video=False)
 927
 928                 # Retrieve video webpage to extract further information
 929                 request = urllib2.Request(url)
 930                 try:
 931                         self.report_download_webpage(video_id)
 932                         webpage = urllib2.urlopen(request).read()
 933                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 934                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 935                         return
 936
 937                 # Extract uploader and title from webpage
 938                 self.report_extraction(video_id)
 939                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
 940                 if mobj is None:
 941                         self._downloader.trouble(u'ERROR: unable to extract video title')
 942                         return
 943                 video_title = mobj.group(1).decode('utf-8')
 944
 945                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
 946                 if mobj is None:
 947                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
 948                         return
 949                 video_uploader = mobj.group(1).decode('utf-8')
 950
 951                 # Extract video thumbnail
 952                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
 953                 if mobj is None:
 954                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
 955                         return
 956                 video_thumbnail = mobj.group(1).decode('utf-8')
 957
 958                 # Extract video description
 959                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
 960                 if mobj is None:
 961                         self._downloader.trouble(u'ERROR: unable to extract video description')
 962                         return
 963                 video_description = mobj.group(1).decode('utf-8')
 964                 if not video_description:
 965                         video_description = 'No description available.'
 966
 967                 # Extract video height and width
 968                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
 969                 if mobj is None:
 970                         self._downloader.trouble(u'ERROR: unable to extract video height')
 971                         return
 972                 yv_video_height = mobj.group(1)
 973
 974                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
 975                 if mobj is None:
 976                         self._downloader.trouble(u'ERROR: unable to extract video width')
 977                         return
 978                 yv_video_width = mobj.group(1)
 979
 980                 # Retrieve video playlist to extract media URL
 981                 # I'm not completely sure what all these options are, but we
 982                 # seem to need most of them, otherwise the server sends a 401.
 983                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
 984                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
 985                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
 986                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
 987                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
 988                 try:
 989                         self.report_download_webpage(video_id)
 990                         webpage = urllib2.urlopen(request).read()
 991                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 992                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
 993                         return
 994
 995                 # Extract media URL from playlist XML
 996                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
 997                 if mobj is None:
 998                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
 999                         return
1000                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1001                 video_url = unescapeHTML(video_url)
1002
1003                 return [{
1004                         'id':           video_id.decode('utf-8'),
1005                         'url':          video_url,
1006                         'uploader':     video_uploader,
1007                         'upload_date':  u'NA',
1008                         'title':        video_title,
1009                         'ext':          video_extension.decode('utf-8'),
1010                         'thumbnail':    video_thumbnail.decode('utf-8'),
1011                         'description':  video_description,
1012                         'thumbnail':    video_thumbnail,
1013                         'player_url':   None,
1014                 }]
1015
1016
1017 class VimeoIE(InfoExtractor):
1018         """Information extractor for vimeo.com."""
1019
1020         # _VALID_URL matches Vimeo URLs
1021         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1022         IE_NAME = u'vimeo'
1023
1024         def __init__(self, downloader=None):
1025                 InfoExtractor.__init__(self, downloader)
1026
1027         def report_download_webpage(self, video_id):
1028                 """Report webpage download."""
1029                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1030
1031         def report_extraction(self, video_id):
1032                 """Report information extraction."""
1033                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1034
1035         def _real_extract(self, url, new_video=True):
1036                 # Extract ID from URL
1037                 mobj = re.match(self._VALID_URL, url)
1038                 if mobj is None:
1039                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1040                         return
1041
1042                 video_id = mobj.group(1)
1043
1044                 # Retrieve video webpage to extract further information
1045                 request = urllib2.Request(url, None, std_headers)
1046                 try:
1047                         self.report_download_webpage(video_id)
1048                         webpage = urllib2.urlopen(request).read()
1049                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1050                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1051                         return
1052
1053                 # Now we begin extracting as much information as we can from what we
1054                 # retrieved. First we extract the information common to all extractors,
1055                 # and latter we extract those that are Vimeo specific.
1056                 self.report_extraction(video_id)
1057
1058                 # Extract the config JSON
1059                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1060                 try:
1061                         config = json.loads(config)
1062                 except:
1063                         self._downloader.trouble(u'ERROR: unable to extract info section')
1064                         return
1065
1066                 # Extract title
1067                 video_title = config["video"]["title"]
1068
1069                 # Extract uploader
1070                 video_uploader = config["video"]["owner"]["name"]
1071
1072                 # Extract video thumbnail
1073                 video_thumbnail = config["video"]["thumbnail"]
1074
1075                 # Extract video description
1076                 video_description = get_element_by_id("description", webpage.decode('utf8'))
1077                 if video_description: video_description = clean_html(video_description)
1078                 else: video_description = ''
1079
1080                 # Extract upload date
1081                 video_upload_date = u'NA'
1082                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1083                 if mobj is not None:
1084                         video_upload_date = mobj.group(1)
1085
1086                 # Vimeo specific: extract request signature and timestamp
1087                 sig = config['request']['signature']
1088                 timestamp = config['request']['timestamp']
1089
1090                 # Vimeo specific: extract video codec and quality information
1091                 # TODO bind to format param
1092                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1093                 for codec in codecs:
1094                         if codec[0] in config["video"]["files"]:
1095                                 video_codec = codec[0]
1096                                 video_extension = codec[1]
1097                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
1098                                 else: quality = 'sd'
1099                                 break
1100                 else:
1101                         self._downloader.trouble(u'ERROR: no known codec found')
1102                         return
1103
1104                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1105                                         %(video_id, sig, timestamp, quality, video_codec.upper())
1106
1107                 return [{
1108                         'id':           video_id,
1109                         'url':          video_url,
1110                         'uploader':     video_uploader,
1111                         'upload_date':  video_upload_date,
1112                         'title':        video_title,
1113                         'ext':          video_extension,
1114                         'thumbnail':    video_thumbnail,
1115                         'description':  video_description,
1116                         'player_url':   None,
1117                 }]
1118
1119
1120 class GenericIE(InfoExtractor):
1121         """Generic last-resort information extractor."""
1122
1123         _VALID_URL = r'.*'
1124         IE_NAME = u'generic'
1125
1126         def __init__(self, downloader=None):
1127                 InfoExtractor.__init__(self, downloader)
1128
1129         def report_download_webpage(self, video_id):
1130                 """Report webpage download."""
1131                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1132                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1133
1134         def report_extraction(self, video_id):
1135                 """Report information extraction."""
1136                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1137
1138         def report_following_redirect(self, new_url):
1139                 """Report information extraction."""
1140                 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1141
1142         def _test_redirect(self, url):
1143                 """Check if it is a redirect, like url shorteners, in case restart chain."""
1144                 class HeadRequest(urllib2.Request):
1145                         def get_method(self):
1146                                 return "HEAD"
1147
1148                 class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
1149                         """
1150                         Subclass the HTTPRedirectHandler to make it use our
1151                         HeadRequest also on the redirected URL
1152                         """
1153                         def redirect_request(self, req, fp, code, msg, headers, newurl):
1154                                 if code in (301, 302, 303, 307):
1155                                         newurl = newurl.replace(' ', '%20')
1156                                         newheaders = dict((k,v) for k,v in req.headers.items()
1157                                                                           if k.lower() not in ("content-length", "content-type"))
1158                                         return HeadRequest(newurl,
1159                                                                            headers=newheaders,
1160                                                                            origin_req_host=req.get_origin_req_host(),
1161                                                                            unverifiable=True)
1162                                 else:
1163                                         raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
1164
1165                 class HTTPMethodFallback(urllib2.BaseHandler):
1166                         """
1167                         Fallback to GET if HEAD is not allowed (405 HTTP error)
1168                         """
1169                         def http_error_405(self, req, fp, code, msg, headers):
1170                                 fp.read()
1171                                 fp.close()
1172
1173                                 newheaders = dict((k,v) for k,v in req.headers.items()
1174                                                                   if k.lower() not in ("content-length", "content-type"))
1175                                 return self.parent.open(urllib2.Request(req.get_full_url(),
1176                                                                                                  headers=newheaders,
1177                                                                                                  origin_req_host=req.get_origin_req_host(),
1178                                                                                                  unverifiable=True))
1179
1180                 # Build our opener
1181                 opener = urllib2.OpenerDirector()
1182                 for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
1183                                                 HTTPMethodFallback, HEADRedirectHandler,
1184                                                 urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
1185                         opener.add_handler(handler())
1186
1187                 response = opener.open(HeadRequest(url))
1188                 new_url = response.geturl()
1189
1190                 if url == new_url: return False
1191
1192                 self.report_following_redirect(new_url)
1193                 self._downloader.download([new_url])
1194                 return True
1195
1196         def _real_extract(self, url):
1197                 if self._test_redirect(url): return
1198
1199                 video_id = url.split('/')[-1]
1200                 request = urllib2.Request(url)
1201                 try:
1202                         self.report_download_webpage(video_id)
1203                         webpage = urllib2.urlopen(request).read()
1204                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1205                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1206                         return
1207                 except ValueError, err:
1208                         # since this is the last-resort InfoExtractor, if
1209                         # this error is thrown, it'll be thrown here
1210                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1211                         return
1212
1213                 self.report_extraction(video_id)
1214                 # Start with something easy: JW Player in SWFObject
1215                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1216                 if mobj is None:
1217                         # Broaden the search a little bit
1218                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1219                 if mobj is None:
1220                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1221                         return
1222
1223                 # It's possible that one of the regexes
1224                 # matched, but returned an empty group:
1225                 if mobj.group(1) is None:
1226                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1227                         return
1228
1229                 video_url = urllib.unquote(mobj.group(1))
1230                 video_id = os.path.basename(video_url)
1231
1232                 # here's a fun little line of code for you:
1233                 video_extension = os.path.splitext(video_id)[1][1:]
1234                 video_id = os.path.splitext(video_id)[0]
1235
1236                 # it's tempting to parse this further, but you would
1237                 # have to take into account all the variations like
1238                 #   Video Title - Site Name
1239                 #   Site Name | Video Title
1240                 #   Video Title - Tagline | Site Name
1241                 # and so on and so forth; it's just not practical
1242                 mobj = re.search(r'<title>(.*)</title>', webpage)
1243                 if mobj is None:
1244                         self._downloader.trouble(u'ERROR: unable to extract title')
1245                         return
1246                 video_title = mobj.group(1).decode('utf-8')
1247
1248                 # video uploader is domain name
1249                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1250                 if mobj is None:
1251                         self._downloader.trouble(u'ERROR: unable to extract title')
1252                         return
1253                 video_uploader = mobj.group(1).decode('utf-8')
1254
1255                 return [{
1256                         'id':           video_id.decode('utf-8'),
1257                         'url':          video_url.decode('utf-8'),
1258                         'uploader':     video_uploader,
1259                         'upload_date':  u'NA',
1260                         'title':        video_title,
1261                         'ext':          video_extension.decode('utf-8'),
1262                         'format':       u'NA',
1263                         'player_url':   None,
1264                 }]
1265
1266
1267 class YoutubeSearchIE(InfoExtractor):
1268         """Information Extractor for YouTube search queries."""
1269         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1270         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1271         _max_youtube_results = 1000
1272         IE_NAME = u'youtube:search'
1273
1274         def __init__(self, downloader=None):
1275                 InfoExtractor.__init__(self, downloader)
1276
1277         def report_download_page(self, query, pagenum):
1278                 """Report attempt to download search page with given number."""
1279                 query = query.decode(preferredencoding())
1280                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1281
1282         def _real_extract(self, query):
1283                 mobj = re.match(self._VALID_URL, query)
1284                 if mobj is None:
1285                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1286                         return
1287
1288                 prefix, query = query.split(':')
1289                 prefix = prefix[8:]
1290                 query = query.encode('utf-8')
1291                 if prefix == '':
1292                         self._download_n_results(query, 1)
1293                         return
1294                 elif prefix == 'all':
1295                         self._download_n_results(query, self._max_youtube_results)
1296                         return
1297                 else:
1298                         try:
1299                                 n = long(prefix)
1300                                 if n <= 0:
1301                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1302                                         return
1303                                 elif n > self._max_youtube_results:
1304                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1305                                         n = self._max_youtube_results
1306                                 self._download_n_results(query, n)
1307                                 return
1308                         except ValueError: # parsing prefix as integer fails
1309                                 self._download_n_results(query, 1)
1310                                 return
1311
1312         def _download_n_results(self, query, n):
1313                 """Downloads a specified number of results for a query"""
1314
1315                 video_ids = []
1316                 pagenum = 0
1317                 limit = n
1318
1319                 while (50 * pagenum) < limit:
1320                         self.report_download_page(query, pagenum+1)
1321                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1322                         request = urllib2.Request(result_url)
1323                         try:
1324                                 data = urllib2.urlopen(request).read()
1325                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1326                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1327                                 return
1328                         api_response = json.loads(data)['data']
1329
1330                         new_ids = list(video['id'] for video in api_response['items'])
1331                         video_ids += new_ids
1332
1333                         limit = min(n, api_response['totalItems'])
1334                         pagenum += 1
1335
1336                 if len(video_ids) > n:
1337                         video_ids = video_ids[:n]
1338                 for id in video_ids:
1339                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1340                 return
1341
1342
1343 class GoogleSearchIE(InfoExtractor):
1344         """Information Extractor for Google Video search queries."""
1345         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1346         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1347         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1348         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1349         _max_google_results = 1000
1350         IE_NAME = u'video.google:search'
1351
1352         def __init__(self, downloader=None):
1353                 InfoExtractor.__init__(self, downloader)
1354
1355         def report_download_page(self, query, pagenum):
1356                 """Report attempt to download playlist page with given number."""
1357                 query = query.decode(preferredencoding())
1358                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1359
1360         def _real_extract(self, query):
1361                 mobj = re.match(self._VALID_URL, query)
1362                 if mobj is None:
1363                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1364                         return
1365
1366                 prefix, query = query.split(':')
1367                 prefix = prefix[8:]
1368                 query = query.encode('utf-8')
1369                 if prefix == '':
1370                         self._download_n_results(query, 1)
1371                         return
1372                 elif prefix == 'all':
1373                         self._download_n_results(query, self._max_google_results)
1374                         return
1375                 else:
1376                         try:
1377                                 n = long(prefix)
1378                                 if n <= 0:
1379                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1380                                         return
1381                                 elif n > self._max_google_results:
1382                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1383                                         n = self._max_google_results
1384                                 self._download_n_results(query, n)
1385                                 return
1386                         except ValueError: # parsing prefix as integer fails
1387                                 self._download_n_results(query, 1)
1388                                 return
1389
1390         def _download_n_results(self, query, n):
1391                 """Downloads a specified number of results for a query"""
1392
1393                 video_ids = []
1394                 pagenum = 0
1395
1396                 while True:
1397                         self.report_download_page(query, pagenum)
1398                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1399                         request = urllib2.Request(result_url)
1400                         try:
1401                                 page = urllib2.urlopen(request).read()
1402                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1403                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1404                                 return
1405
1406                         # Extract video identifiers
1407                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1408                                 video_id = mobj.group(1)
1409                                 if video_id not in video_ids:
1410                                         video_ids.append(video_id)
1411                                         if len(video_ids) == n:
1412                                                 # Specified n videos reached
1413                                                 for id in video_ids:
1414                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1415                                                 return
1416
1417                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1418                                 for id in video_ids:
1419                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1420                                 return
1421
1422                         pagenum = pagenum + 1
1423
1424
1425 class YahooSearchIE(InfoExtractor):
1426         """Information Extractor for Yahoo! Video search queries."""
1427         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1428         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1429         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1430         _MORE_PAGES_INDICATOR = r'\s*Next'
1431         _max_yahoo_results = 1000
1432         IE_NAME = u'video.yahoo:search'
1433
1434         def __init__(self, downloader=None):
1435                 InfoExtractor.__init__(self, downloader)
1436
1437         def report_download_page(self, query, pagenum):
1438                 """Report attempt to download playlist page with given number."""
1439                 query = query.decode(preferredencoding())
1440                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1441
1442         def _real_extract(self, query):
1443                 mobj = re.match(self._VALID_URL, query)
1444                 if mobj is None:
1445                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1446                         return
1447
1448                 prefix, query = query.split(':')
1449                 prefix = prefix[8:]
1450                 query = query.encode('utf-8')
1451                 if prefix == '':
1452                         self._download_n_results(query, 1)
1453                         return
1454                 elif prefix == 'all':
1455                         self._download_n_results(query, self._max_yahoo_results)
1456                         return
1457                 else:
1458                         try:
1459                                 n = long(prefix)
1460                                 if n <= 0:
1461                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1462                                         return
1463                                 elif n > self._max_yahoo_results:
1464                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1465                                         n = self._max_yahoo_results
1466                                 self._download_n_results(query, n)
1467                                 return
1468                         except ValueError: # parsing prefix as integer fails
1469                                 self._download_n_results(query, 1)
1470                                 return
1471
1472         def _download_n_results(self, query, n):
1473                 """Downloads a specified number of results for a query"""
1474
1475                 video_ids = []
1476                 already_seen = set()
1477                 pagenum = 1
1478
1479                 while True:
1480                         self.report_download_page(query, pagenum)
1481                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1482                         request = urllib2.Request(result_url)
1483                         try:
1484                                 page = urllib2.urlopen(request).read()
1485                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1487                                 return
1488
1489                         # Extract video identifiers
1490                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1491                                 video_id = mobj.group(1)
1492                                 if video_id not in already_seen:
1493                                         video_ids.append(video_id)
1494                                         already_seen.add(video_id)
1495                                         if len(video_ids) == n:
1496                                                 # Specified n videos reached
1497                                                 for id in video_ids:
1498                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1499                                                 return
1500
1501                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1502                                 for id in video_ids:
1503                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1504                                 return
1505
1506                         pagenum = pagenum + 1
1507
1508
1509 class YoutubePlaylistIE(InfoExtractor):
1510         """Information Extractor for YouTube playlists."""
1511
1512         _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
1513         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1514         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1515         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
1516         IE_NAME = u'youtube:playlist'
1517
1518         def __init__(self, downloader=None):
1519                 InfoExtractor.__init__(self, downloader)
1520
1521         def report_download_page(self, playlist_id, pagenum):
1522                 """Report attempt to download playlist page with given number."""
1523                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1524
1525         def _real_extract(self, url):
1526                 # Extract playlist id
1527                 mobj = re.match(self._VALID_URL, url)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1530                         return
1531
1532                 # Single video case
1533                 if mobj.group(3) is not None:
1534                         self._downloader.download([mobj.group(3)])
1535                         return
1536
1537                 # Download playlist pages
1538                 # prefix is 'p' as default for playlists but there are other types that need extra care
1539                 playlist_prefix = mobj.group(1)
1540                 if playlist_prefix == 'a':
1541                         playlist_access = 'artist'
1542                 else:
1543                         playlist_prefix = 'p'
1544                         playlist_access = 'view_play_list'
1545                 playlist_id = mobj.group(2)
1546                 video_ids = []
1547                 pagenum = 1
1548
1549                 while True:
1550                         self.report_download_page(playlist_id, pagenum)
1551                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1552                         request = urllib2.Request(url)
1553                         try:
1554                                 page = urllib2.urlopen(request).read()
1555                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1556                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1557                                 return
1558
1559                         # Extract video identifiers
1560                         ids_in_page = []
1561                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1562                                 if mobj.group(1) not in ids_in_page:
1563                                         ids_in_page.append(mobj.group(1))
1564                         video_ids.extend(ids_in_page)
1565
1566                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567                                 break
1568                         pagenum = pagenum + 1
1569
1570                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1571                 playlistend = self._downloader.params.get('playlistend', -1)
1572                 if playlistend == -1:
1573                         video_ids = video_ids[playliststart:]
1574                 else:
1575                         video_ids = video_ids[playliststart:playlistend]
1576
1577                 for id in video_ids:
1578                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1579                 return
1580
1581
1582 class YoutubeChannelIE(InfoExtractor):
1583         """Information Extractor for YouTube channels."""
1584
1585         _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1586         _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1587         _MORE_PAGES_INDICATOR = r'yt-uix-button-content">Next' # TODO
1588         IE_NAME = u'youtube:channel'
1589
1590         def report_download_page(self, channel_id, pagenum):
1591                 """Report attempt to download channel page with given number."""
1592                 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1593
1594         def _real_extract(self, url):
1595                 # Extract channel id
1596                 mobj = re.match(self._VALID_URL, url)
1597                 if mobj is None:
1598                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1599                         return
1600
1601                 # Download channel pages
1602                 channel_id = mobj.group(1)
1603                 video_ids = []
1604                 pagenum = 1
1605
1606                 while True:
1607                         self.report_download_page(channel_id, pagenum)
1608                         url = self._TEMPLATE_URL % (channel_id, pagenum)
1609                         request = urllib2.Request(url)
1610                         try:
1611                                 page = urllib2.urlopen(request).read()
1612                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1613                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1614                                 return
1615
1616                         # Extract video identifiers
1617                         ids_in_page = []
1618                         for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1619                                 if mobj.group(1) not in ids_in_page:
1620                                         ids_in_page.append(mobj.group(1))
1621                         video_ids.extend(ids_in_page)
1622
1623                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1624                                 break
1625                         pagenum = pagenum + 1
1626
1627                 for id in video_ids:
1628                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1629                 return
1630
1631
1632 class YoutubeUserIE(InfoExtractor):
1633         """Information Extractor for YouTube users."""
1634
1635         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1636         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1637         _GDATA_PAGE_SIZE = 50
1638         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1639         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1640         IE_NAME = u'youtube:user'
1641
1642         def __init__(self, downloader=None):
1643                 InfoExtractor.__init__(self, downloader)
1644
1645         def report_download_page(self, username, start_index):
1646                 """Report attempt to download user page."""
1647                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1648                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1649
1650         def _real_extract(self, url):
1651                 # Extract username
1652                 mobj = re.match(self._VALID_URL, url)
1653                 if mobj is None:
1654                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1655                         return
1656
1657                 username = mobj.group(1)
1658
1659                 # Download video ids using YouTube Data API. Result size per
1660                 # query is limited (currently to 50 videos) so we need to query
1661                 # page by page until there are no video ids - it means we got
1662                 # all of them.
1663
1664                 video_ids = []
1665                 pagenum = 0
1666
1667                 while True:
1668                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1669                         self.report_download_page(username, start_index)
1670
1671                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1672
1673                         try:
1674                                 page = urllib2.urlopen(request).read()
1675                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1677                                 return
1678
1679                         # Extract video identifiers
1680                         ids_in_page = []
1681
1682                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1683                                 if mobj.group(1) not in ids_in_page:
1684                                         ids_in_page.append(mobj.group(1))
1685
1686                         video_ids.extend(ids_in_page)
1687
1688                         # A little optimization - if current page is not
1689                         # "full", ie. does not contain PAGE_SIZE video ids then
1690                         # we can assume that this page is the last one - there
1691                         # are no more ids on further pages - no need to query
1692                         # again.
1693
1694                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1695                                 break
1696
1697                         pagenum += 1
1698
1699                 all_ids_count = len(video_ids)
1700                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1701                 playlistend = self._downloader.params.get('playlistend', -1)
1702
1703                 if playlistend == -1:
1704                         video_ids = video_ids[playliststart:]
1705                 else:
1706                         video_ids = video_ids[playliststart:playlistend]
1707
1708                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1709                                 (username, all_ids_count, len(video_ids)))
1710
1711                 for video_id in video_ids:
1712                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1713
1714
1715 class BlipTVUserIE(InfoExtractor):
1716         """Information Extractor for blip.tv users."""
1717
1718         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1719         _PAGE_SIZE = 12
1720         IE_NAME = u'blip.tv:user'
1721
1722         def __init__(self, downloader=None):
1723                 InfoExtractor.__init__(self, downloader)
1724
1725         def report_download_page(self, username, pagenum):
1726                 """Report attempt to download user page."""
1727                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1728                                 (self.IE_NAME, username, pagenum))
1729
1730         def _real_extract(self, url):
1731                 # Extract username
1732                 mobj = re.match(self._VALID_URL, url)
1733                 if mobj is None:
1734                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1735                         return
1736
1737                 username = mobj.group(1)
1738
1739                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1740
1741                 request = urllib2.Request(url)
1742
1743                 try:
1744                         page = urllib2.urlopen(request).read().decode('utf-8')
1745                         mobj = re.search(r'data-users-id="([^"]+)"', page)
1746                         page_base = page_base % mobj.group(1)
1747                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1748                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1749                         return
1750
1751
1752                 # Download video ids using BlipTV Ajax calls. Result size per
1753                 # query is limited (currently to 12 videos) so we need to query
1754                 # page by page until there are no video ids - it means we got
1755                 # all of them.
1756
1757                 video_ids = []
1758                 pagenum = 1
1759
1760                 while True:
1761                         self.report_download_page(username, pagenum)
1762
1763                         request = urllib2.Request( page_base + "&page=" + str(pagenum) )
1764
1765                         try:
1766                                 page = urllib2.urlopen(request).read().decode('utf-8')
1767                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1768                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1769                                 return
1770
1771                         # Extract video identifiers
1772                         ids_in_page = []
1773
1774                         for mobj in re.finditer(r'href="/([^"]+)"', page):
1775                                 if mobj.group(1) not in ids_in_page:
1776                                         ids_in_page.append(unescapeHTML(mobj.group(1)))
1777
1778                         video_ids.extend(ids_in_page)
1779
1780                         # A little optimization - if current page is not
1781                         # "full", ie. does not contain PAGE_SIZE video ids then
1782                         # we can assume that this page is the last one - there
1783                         # are no more ids on further pages - no need to query
1784                         # again.
1785
1786                         if len(ids_in_page) < self._PAGE_SIZE:
1787                                 break
1788
1789                         pagenum += 1
1790
1791                 all_ids_count = len(video_ids)
1792                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1793                 playlistend = self._downloader.params.get('playlistend', -1)
1794
1795                 if playlistend == -1:
1796                         video_ids = video_ids[playliststart:]
1797                 else:
1798                         video_ids = video_ids[playliststart:playlistend]
1799
1800                 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1801                                 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1802
1803                 for video_id in video_ids:
1804                         self._downloader.download([u'http://blip.tv/'+video_id])
1805
1806
1807 class DepositFilesIE(InfoExtractor):
1808         """Information extractor for depositfiles.com"""
1809
1810         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1811         IE_NAME = u'DepositFiles'
1812
1813         def __init__(self, downloader=None):
1814                 InfoExtractor.__init__(self, downloader)
1815
1816         def report_download_webpage(self, file_id):
1817                 """Report webpage download."""
1818                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1819
1820         def report_extraction(self, file_id):
1821                 """Report information extraction."""
1822                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1823
1824         def _real_extract(self, url):
1825                 file_id = url.split('/')[-1]
1826                 # Rebuild url in english locale
1827                 url = 'http://depositfiles.com/en/files/' + file_id
1828
1829                 # Retrieve file webpage with 'Free download' button pressed
1830                 free_download_indication = { 'gateway_result' : '1' }
1831                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
1832                 try:
1833                         self.report_download_webpage(file_id)
1834                         webpage = urllib2.urlopen(request).read()
1835                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1836                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
1837                         return
1838
1839                 # Search for the real file URL
1840                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1841                 if (mobj is None) or (mobj.group(1) is None):
1842                         # Try to figure out reason of the error.
1843                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1844                         if (mobj is not None) and (mobj.group(1) is not None):
1845                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1846                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1847                         else:
1848                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1849                         return
1850
1851                 file_url = mobj.group(1)
1852                 file_extension = os.path.splitext(file_url)[1][1:]
1853
1854                 # Search for file title
1855                 mobj = re.search(r'<b title="(.*?)">', webpage)
1856                 if mobj is None:
1857                         self._downloader.trouble(u'ERROR: unable to extract title')
1858                         return
1859                 file_title = mobj.group(1).decode('utf-8')
1860
1861                 return [{
1862                         'id':           file_id.decode('utf-8'),
1863                         'url':          file_url.decode('utf-8'),
1864                         'uploader':     u'NA',
1865                         'upload_date':  u'NA',
1866                         'title':        file_title,
1867                         'ext':          file_extension.decode('utf-8'),
1868                         'format':       u'NA',
1869                         'player_url':   None,
1870                 }]
1871
1872
1873 class FacebookIE(InfoExtractor):
1874         """Information Extractor for Facebook"""
1875
1876         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1877         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1878         _NETRC_MACHINE = 'facebook'
1879         _available_formats = ['video', 'highqual', 'lowqual']
1880         _video_extensions = {
1881                 'video': 'mp4',
1882                 'highqual': 'mp4',
1883                 'lowqual': 'mp4',
1884         }
1885         IE_NAME = u'facebook'
1886
1887         def __init__(self, downloader=None):
1888                 InfoExtractor.__init__(self, downloader)
1889
1890         def _reporter(self, message):
1891                 """Add header and report message."""
1892                 self._downloader.to_screen(u'[facebook] %s' % message)
1893
1894         def report_login(self):
1895                 """Report attempt to log in."""
1896                 self._reporter(u'Logging in')
1897
1898         def report_video_webpage_download(self, video_id):
1899                 """Report attempt to download video webpage."""
1900                 self._reporter(u'%s: Downloading video webpage' % video_id)
1901
1902         def report_information_extraction(self, video_id):
1903                 """Report attempt to extract video information."""
1904                 self._reporter(u'%s: Extracting video information' % video_id)
1905
1906         def _parse_page(self, video_webpage):
1907                 """Extract video information from page"""
1908                 # General data
1909                 data = {'title': r'\("video_title", "(.*?)"\)',
1910                         'description': r'<div class="datawrap">(.*?)</div>',
1911                         'owner': r'\("video_owner_name", "(.*?)"\)',
1912                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1913                         }
1914                 video_info = {}
1915                 for piece in data.keys():
1916                         mobj = re.search(data[piece], video_webpage)
1917                         if mobj is not None:
1918                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1919
1920                 # Video urls
1921                 video_urls = {}
1922                 for fmt in self._available_formats:
1923                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1924                         if mobj is not None:
1925                                 # URL is in a Javascript segment inside an escaped Unicode format within
1926                                 # the generally utf-8 page
1927                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1928                 video_info['video_urls'] = video_urls
1929
1930                 return video_info
1931
1932         def _real_initialize(self):
1933                 if self._downloader is None:
1934                         return
1935
1936                 useremail = None
1937                 password = None
1938                 downloader_params = self._downloader.params
1939
1940                 # Attempt to use provided username and password or .netrc data
1941                 if downloader_params.get('username', None) is not None:
1942                         useremail = downloader_params['username']
1943                         password = downloader_params['password']
1944                 elif downloader_params.get('usenetrc', False):
1945                         try:
1946                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1947                                 if info is not None:
1948                                         useremail = info[0]
1949                                         password = info[2]
1950                                 else:
1951                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1952                         except (IOError, netrc.NetrcParseError), err:
1953                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1954                                 return
1955
1956                 if useremail is None:
1957                         return
1958
1959                 # Log in
1960                 login_form = {
1961                         'email': useremail,
1962                         'pass': password,
1963                         'login': 'Log+In'
1964                         }
1965                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1966                 try:
1967                         self.report_login()
1968                         login_results = urllib2.urlopen(request).read()
1969                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1970                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1971                                 return
1972                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1973                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1974                         return
1975
1976         def _real_extract(self, url):
1977                 mobj = re.match(self._VALID_URL, url)
1978                 if mobj is None:
1979                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1980                         return
1981                 video_id = mobj.group('ID')
1982
1983                 # Get video webpage
1984                 self.report_video_webpage_download(video_id)
1985                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1986                 try:
1987                         page = urllib2.urlopen(request)
1988                         video_webpage = page.read()
1989                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1990                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1991                         return
1992
1993                 # Start extracting information
1994                 self.report_information_extraction(video_id)
1995
1996                 # Extract information
1997                 video_info = self._parse_page(video_webpage)
1998
1999                 # uploader
2000                 if 'owner' not in video_info:
2001                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2002                         return
2003                 video_uploader = video_info['owner']
2004
2005                 # title
2006                 if 'title' not in video_info:
2007                         self._downloader.trouble(u'ERROR: unable to extract video title')
2008                         return
2009                 video_title = video_info['title']
2010                 video_title = video_title.decode('utf-8')
2011
2012                 # thumbnail image
2013                 if 'thumbnail' not in video_info:
2014                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2015                         video_thumbnail = ''
2016                 else:
2017                         video_thumbnail = video_info['thumbnail']
2018
2019                 # upload date
2020                 upload_date = u'NA'
2021                 if 'upload_date' in video_info:
2022                         upload_time = video_info['upload_date']
2023                         timetuple = email.utils.parsedate_tz(upload_time)
2024                         if timetuple is not None:
2025                                 try:
2026                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2027                                 except:
2028                                         pass
2029
2030                 # description
2031                 video_description = video_info.get('description', 'No description available.')
2032
2033                 url_map = video_info['video_urls']
2034                 if len(url_map.keys()) > 0:
2035                         # Decide which formats to download
2036                         req_format = self._downloader.params.get('format', None)
2037                         format_limit = self._downloader.params.get('format_limit', None)
2038
2039                         if format_limit is not None and format_limit in self._available_formats:
2040                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2041                         else:
2042                                 format_list = self._available_formats
2043                         existing_formats = [x for x in format_list if x in url_map]
2044                         if len(existing_formats) == 0:
2045                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2046                                 return
2047                         if req_format is None:
2048                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2049                         elif req_format == 'worst':
2050                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2051                         elif req_format == '-1':
2052                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2053                         else:
2054                                 # Specific format
2055                                 if req_format not in url_map:
2056                                         self._downloader.trouble(u'ERROR: requested format not available')
2057                                         return
2058                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2059
2060                 results = []
2061                 for format_param, video_real_url in video_url_list:
2062                         # Extension
2063                         video_extension = self._video_extensions.get(format_param, 'mp4')
2064
2065                         results.append({
2066                                 'id':           video_id.decode('utf-8'),
2067                                 'url':          video_real_url.decode('utf-8'),
2068                                 'uploader':     video_uploader.decode('utf-8'),
2069                                 'upload_date':  upload_date,
2070                                 'title':        video_title,
2071                                 'ext':          video_extension.decode('utf-8'),
2072                                 'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2073                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2074                                 'description':  video_description.decode('utf-8'),
2075                                 'player_url':   None,
2076                         })
2077                 return results
2078
2079 class BlipTVIE(InfoExtractor):
2080         """Information extractor for blip.tv"""
2081
2082         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2083         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2084         IE_NAME = u'blip.tv'
2085
2086         def report_extraction(self, file_id):
2087                 """Report information extraction."""
2088                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2089
2090         def report_direct_download(self, title):
2091                 """Report information extraction."""
2092                 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2093
2094         def _real_extract(self, url):
2095                 mobj = re.match(self._VALID_URL, url)
2096                 if mobj is None:
2097                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2098                         return
2099
2100                 if '?' in url:
2101                         cchar = '&'
2102                 else:
2103                         cchar = '?'
2104                 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2105                 request = urllib2.Request(json_url.encode('utf-8'))
2106                 self.report_extraction(mobj.group(1))
2107                 info = None
2108                 try:
2109                         urlh = urllib2.urlopen(request)
2110                         if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2111                                 basename = url.split('/')[-1]
2112                                 title,ext = os.path.splitext(basename)
2113                                 title = title.decode('UTF-8')
2114                                 ext = ext.replace('.', '')
2115                                 self.report_direct_download(title)
2116                                 info = {
2117                                         'id': title,
2118                                         'url': url,
2119                                         'title': title,
2120                                         'ext': ext,
2121                                         'urlhandle': urlh
2122                                 }
2123                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2124                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2125                         return
2126                 if info is None: # Regular URL
2127                         try:
2128                                 json_code = urlh.read()
2129                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2130                                 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2131                                 return
2132
2133                         try:
2134                                 json_data = json.loads(json_code)
2135                                 if 'Post' in json_data:
2136                                         data = json_data['Post']
2137                                 else:
2138                                         data = json_data
2139
2140                                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2141                                 video_url = data['media']['url']
2142                                 umobj = re.match(self._URL_EXT, video_url)
2143                                 if umobj is None:
2144                                         raise ValueError('Can not determine filename extension')
2145                                 ext = umobj.group(1)
2146
2147                                 info = {
2148                                         'id': data['item_id'],
2149                                         'url': video_url,
2150                                         'uploader': data['display_name'],
2151                                         'upload_date': upload_date,
2152                                         'title': data['title'],
2153                                         'ext': ext,
2154                                         'format': data['media']['mimeType'],
2155                                         'thumbnail': data['thumbnailUrl'],
2156                                         'description': data['description'],
2157                                         'player_url': data['embedUrl']
2158                                 }
2159                         except (ValueError,KeyError), err:
2160                                 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2161                                 return
2162
2163                 std_headers['User-Agent'] = 'iTunes/10.6.1'
2164                 return [info]
2165
2166
2167 class MyVideoIE(InfoExtractor):
2168         """Information Extractor for myvideo.de."""
2169
2170         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2171         IE_NAME = u'myvideo'
2172
2173         def __init__(self, downloader=None):
2174                 InfoExtractor.__init__(self, downloader)
2175
2176         def report_download_webpage(self, video_id):
2177                 """Report webpage download."""
2178                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2179
2180         def report_extraction(self, video_id):
2181                 """Report information extraction."""
2182                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2183
2184         def _real_extract(self,url):
2185                 mobj = re.match(self._VALID_URL, url)
2186                 if mobj is None:
2187                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
2188                         return
2189
2190                 video_id = mobj.group(1)
2191
2192                 # Get video webpage
2193                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2194                 try:
2195                         self.report_download_webpage(video_id)
2196                         webpage = urllib2.urlopen(request).read()
2197                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2198                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2199                         return
2200
2201                 self.report_extraction(video_id)
2202                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2203                                  webpage)
2204                 if mobj is None:
2205                         self._downloader.trouble(u'ERROR: unable to extract media URL')
2206                         return
2207                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2208
2209                 mobj = re.search('<title>([^<]+)</title>', webpage)
2210                 if mobj is None:
2211                         self._downloader.trouble(u'ERROR: unable to extract title')
2212                         return
2213
2214                 video_title = mobj.group(1)
2215
2216                 return [{
2217                         'id':           video_id,
2218                         'url':          video_url,
2219                         'uploader':     u'NA',
2220                         'upload_date':  u'NA',
2221                         'title':        video_title,
2222                         'ext':          u'flv',
2223                         'format':       u'NA',
2224                         'player_url':   None,
2225                 }]
2226
2227 class ComedyCentralIE(InfoExtractor):
2228         """Information extractor for The Daily Show and Colbert Report """
2229
2230         _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2231         IE_NAME = u'comedycentral'
2232
2233         def report_extraction(self, episode_id):
2234                 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2235
2236         def report_config_download(self, episode_id):
2237                 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2238
2239         def report_index_download(self, episode_id):
2240                 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2241
2242         def report_player_url(self, episode_id):
2243                 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2244
2245         def _real_extract(self, url):
2246                 mobj = re.match(self._VALID_URL, url)
2247                 if mobj is None:
2248                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2249                         return
2250
2251                 if mobj.group('shortname'):
2252                         if mobj.group('shortname') in ('tds', 'thedailyshow'):
2253                                 url = u'http://www.thedailyshow.com/full-episodes/'
2254                         else:
2255                                 url = u'http://www.colbertnation.com/full-episodes/'
2256                         mobj = re.match(self._VALID_URL, url)
2257                         assert mobj is not None
2258
2259                 dlNewest = not mobj.group('episode')
2260                 if dlNewest:
2261                         epTitle = mobj.group('showname')
2262                 else:
2263                         epTitle = mobj.group('episode')
2264
2265                 req = urllib2.Request(url)
2266                 self.report_extraction(epTitle)
2267                 try:
2268                         htmlHandle = urllib2.urlopen(req)
2269                         html = htmlHandle.read()
2270                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2271                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2272                         return
2273                 if dlNewest:
2274                         url = htmlHandle.geturl()
2275                         mobj = re.match(self._VALID_URL, url)
2276                         if mobj is None:
2277                                 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2278                                 return
2279                         if mobj.group('episode') == '':
2280                                 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2281                                 return
2282                         epTitle = mobj.group('episode')
2283
2284                 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2285                 if len(mMovieParams) == 0:
2286                         self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2287                         return
2288
2289                 playerUrl_raw = mMovieParams[0][0]
2290                 self.report_player_url(epTitle)
2291                 try:
2292                         urlHandle = urllib2.urlopen(playerUrl_raw)
2293                         playerUrl = urlHandle.geturl()
2294                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2295                         self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2296                         return
2297
2298                 uri = mMovieParams[0][1]
2299                 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2300                 self.report_index_download(epTitle)
2301                 try:
2302                         indexXml = urllib2.urlopen(indexUrl).read()
2303                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2304                         self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2305                         return
2306
2307                 results = []
2308
2309                 idoc = xml.etree.ElementTree.fromstring(indexXml)
2310                 itemEls = idoc.findall('.//item')
2311                 for itemEl in itemEls:
2312                         mediaId = itemEl.findall('./guid')[0].text
2313                         shortMediaId = mediaId.split(':')[-1]
2314                         showId = mediaId.split(':')[-2].replace('.com', '')
2315                         officialTitle = itemEl.findall('./title')[0].text
2316                         officialDate = itemEl.findall('./pubDate')[0].text
2317
2318                         configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2319                                                 urllib.urlencode({'uri': mediaId}))
2320                         configReq = urllib2.Request(configUrl)
2321                         self.report_config_download(epTitle)
2322                         try:
2323                                 configXml = urllib2.urlopen(configReq).read()
2324                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2325                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2326                                 return
2327
2328                         cdoc = xml.etree.ElementTree.fromstring(configXml)
2329                         turls = []
2330                         for rendition in cdoc.findall('.//rendition'):
2331                                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2332                                 turls.append(finfo)
2333
2334                         if len(turls) == 0:
2335                                 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2336                                 continue
2337
2338                         # For now, just pick the highest bitrate
2339                         format,video_url = turls[-1]
2340
2341                         effTitle = showId + u'-' + epTitle
2342                         info = {
2343                                 'id': shortMediaId,
2344                                 'url': video_url,
2345                                 'uploader': showId,
2346                                 'upload_date': officialDate,
2347                                 'title': effTitle,
2348                                 'ext': 'mp4',
2349                                 'format': format,
2350                                 'thumbnail': None,
2351                                 'description': officialTitle,
2352                                 'player_url': playerUrl
2353                         }
2354
2355                         results.append(info)
2356
2357                 return results
2358
2359
2360 class EscapistIE(InfoExtractor):
2361         """Information extractor for The Escapist """
2362
2363         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2364         IE_NAME = u'escapist'
2365
2366         def report_extraction(self, showName):
2367                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2368
2369         def report_config_download(self, showName):
2370                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2371
2372         def _real_extract(self, url):
2373                 mobj = re.match(self._VALID_URL, url)
2374                 if mobj is None:
2375                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2376                         return
2377                 showName = mobj.group('showname')
2378                 videoId = mobj.group('episode')
2379
2380                 self.report_extraction(showName)
2381                 try:
2382                         webPage = urllib2.urlopen(url)
2383                         webPageBytes = webPage.read()
2384                         m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2385                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2386                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2388                         return
2389
2390                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2391                 description = unescapeHTML(descMatch.group(1))
2392                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2393                 imgUrl = unescapeHTML(imgMatch.group(1))
2394                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2395                 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2396                 configUrlMatch = re.search('config=(.*)$', playerUrl)
2397                 configUrl = urllib2.unquote(configUrlMatch.group(1))
2398
2399                 self.report_config_download(showName)
2400                 try:
2401                         configJSON = urllib2.urlopen(configUrl).read()
2402                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2403                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2404                         return
2405
2406                 # Technically, it's JavaScript, not JSON
2407                 configJSON = configJSON.replace("'", '"')
2408
2409                 try:
2410                         config = json.loads(configJSON)
2411                 except (ValueError,), err:
2412                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2413                         return
2414
2415                 playlist = config['playlist']
2416                 videoUrl = playlist[1]['url']
2417
2418                 info = {
2419                         'id': videoId,
2420                         'url': videoUrl,
2421                         'uploader': showName,
2422                         'upload_date': None,
2423                         'title': showName,
2424                         'ext': 'flv',
2425                         'format': 'flv',
2426                         'thumbnail': imgUrl,
2427                         'description': description,
2428                         'player_url': playerUrl,
2429                 }
2430
2431                 return [info]
2432
2433
2434 class CollegeHumorIE(InfoExtractor):
2435         """Information extractor for collegehumor.com"""
2436
2437         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2438         IE_NAME = u'collegehumor'
2439
2440         def report_webpage(self, video_id):
2441                 """Report information extraction."""
2442                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2443
2444         def report_extraction(self, video_id):
2445                 """Report information extraction."""
2446                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2447
2448         def _real_extract(self, url):
2449                 mobj = re.match(self._VALID_URL, url)
2450                 if mobj is None:
2451                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2452                         return
2453                 video_id = mobj.group('videoid')
2454
2455                 self.report_webpage(video_id)
2456                 request = urllib2.Request(url)
2457                 try:
2458                         webpage = urllib2.urlopen(request).read()
2459                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2460                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2461                         return
2462
2463                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2464                 if m is None:
2465                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2466                         return
2467                 internal_video_id = m.group('internalvideoid')
2468
2469                 info = {
2470                         'id': video_id,
2471                         'internal_id': internal_video_id,
2472                 }
2473
2474                 self.report_extraction(video_id)
2475                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2476                 try:
2477                         metaXml = urllib2.urlopen(xmlUrl).read()
2478                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2479                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2480                         return
2481
2482                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2483                 try:
2484                         videoNode = mdoc.findall('./video')[0]
2485                         info['description'] = videoNode.findall('./description')[0].text
2486                         info['title'] = videoNode.findall('./caption')[0].text
2487                         info['url'] = videoNode.findall('./file')[0].text
2488                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2489                         info['ext'] = info['url'].rpartition('.')[2]
2490                         info['format'] = info['ext']
2491                 except IndexError:
2492                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2493                         return
2494
2495                 return [info]
2496
2497
2498 class XVideosIE(InfoExtractor):
2499         """Information extractor for xvideos.com"""
2500
2501         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2502         IE_NAME = u'xvideos'
2503
2504         def report_webpage(self, video_id):
2505                 """Report information extraction."""
2506                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2507
2508         def report_extraction(self, video_id):
2509                 """Report information extraction."""
2510                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2511
2512         def _real_extract(self, url):
2513                 mobj = re.match(self._VALID_URL, url)
2514                 if mobj is None:
2515                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2516                         return
2517                 video_id = mobj.group(1).decode('utf-8')
2518
2519                 self.report_webpage(video_id)
2520
2521                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2522                 try:
2523                         webpage = urllib2.urlopen(request).read()
2524                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2525                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2526                         return
2527
2528                 self.report_extraction(video_id)
2529
2530
2531                 # Extract video URL
2532                 mobj = re.search(r'flv_url=(.+?)&', webpage)
2533                 if mobj is None:
2534                         self._downloader.trouble(u'ERROR: unable to extract video url')
2535                         return
2536                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2537
2538
2539                 # Extract title
2540                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2541                 if mobj is None:
2542                         self._downloader.trouble(u'ERROR: unable to extract video title')
2543                         return
2544                 video_title = mobj.group(1).decode('utf-8')
2545
2546
2547                 # Extract video thumbnail
2548                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2549                 if mobj is None:
2550                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2551                         return
2552                 video_thumbnail = mobj.group(0).decode('utf-8')
2553
2554                 info = {
2555                         'id': video_id,
2556                         'url': video_url,
2557                         'uploader': None,
2558                         'upload_date': None,
2559                         'title': video_title,
2560                         'ext': 'flv',
2561                         'format': 'flv',
2562                         'thumbnail': video_thumbnail,
2563                         'description': None,
2564                         'player_url': None,
2565                 }
2566
2567                 return [info]
2568
2569
2570 class SoundcloudIE(InfoExtractor):
2571         """Information extractor for soundcloud.com
2572            To access the media, the uid of the song and a stream token
2573            must be extracted from the page source and the script must make
2574            a request to media.soundcloud.com/crossdomain.xml. Then
2575            the media can be grabbed by requesting from an url composed
2576            of the stream token and uid
2577          """
2578
2579         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2580         IE_NAME = u'soundcloud'
2581
2582         def __init__(self, downloader=None):
2583                 InfoExtractor.__init__(self, downloader)
2584
2585         def report_webpage(self, video_id):
2586                 """Report information extraction."""
2587                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2588
2589         def report_extraction(self, video_id):
2590                 """Report information extraction."""
2591                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2592
2593         def _real_extract(self, url):
2594                 mobj = re.match(self._VALID_URL, url)
2595                 if mobj is None:
2596                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2597                         return
2598
2599                 # extract uploader (which is in the url)
2600                 uploader = mobj.group(1).decode('utf-8')
2601                 # extract simple title (uploader + slug of song title)
2602                 slug_title =  mobj.group(2).decode('utf-8')
2603                 simple_title = uploader + u'-' + slug_title
2604
2605                 self.report_webpage('%s/%s' % (uploader, slug_title))
2606
2607                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2608                 try:
2609                         webpage = urllib2.urlopen(request).read()
2610                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2611                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2612                         return
2613
2614                 self.report_extraction('%s/%s' % (uploader, slug_title))
2615
2616                 # extract uid and stream token that soundcloud hands out for access
2617                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2618                 if mobj:
2619                         video_id = mobj.group(1)
2620                         stream_token = mobj.group(2)
2621
2622                 # extract unsimplified title
2623                 mobj = re.search('"title":"(.*?)",', webpage)
2624                 if mobj:
2625                         title = mobj.group(1).decode('utf-8')
2626                 else:
2627                         title = simple_title
2628
2629                 # construct media url (with uid/token)
2630                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2631                 mediaURL = mediaURL % (video_id, stream_token)
2632
2633                 # description
2634                 description = u'No description available'
2635                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2636                 if mobj:
2637                         description = mobj.group(1)
2638
2639                 # upload date
2640                 upload_date = None
2641                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2642                 if mobj:
2643                         try:
2644                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2645                         except Exception, e:
2646                                 self._downloader.to_stderr(str(e))
2647
2648                 # for soundcloud, a request to a cross domain is required for cookies
2649                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2650
2651                 return [{
2652                         'id':           video_id.decode('utf-8'),
2653                         'url':          mediaURL,
2654                         'uploader':     uploader.decode('utf-8'),
2655                         'upload_date':  upload_date,
2656                         'title':        title,
2657                         'ext':          u'mp3',
2658                         'format':       u'NA',
2659                         'player_url':   None,
2660                         'description': description.decode('utf-8')
2661                 }]
2662
2663
2664 class InfoQIE(InfoExtractor):
2665         """Information extractor for infoq.com"""
2666
2667         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2668         IE_NAME = u'infoq'
2669
2670         def report_webpage(self, video_id):
2671                 """Report information extraction."""
2672                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2673
2674         def report_extraction(self, video_id):
2675                 """Report information extraction."""
2676                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2677
2678         def _real_extract(self, url):
2679                 mobj = re.match(self._VALID_URL, url)
2680                 if mobj is None:
2681                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2682                         return
2683
2684                 self.report_webpage(url)
2685
2686                 request = urllib2.Request(url)
2687                 try:
2688                         webpage = urllib2.urlopen(request).read()
2689                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2690                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2691                         return
2692
2693                 self.report_extraction(url)
2694
2695
2696                 # Extract video URL
2697                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2698                 if mobj is None:
2699                         self._downloader.trouble(u'ERROR: unable to extract video url')
2700                         return
2701                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2702
2703
2704                 # Extract title
2705                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2706                 if mobj is None:
2707                         self._downloader.trouble(u'ERROR: unable to extract video title')
2708                         return
2709                 video_title = mobj.group(1).decode('utf-8')
2710
2711                 # Extract description
2712                 video_description = u'No description available.'
2713                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2714                 if mobj is not None:
2715                         video_description = mobj.group(1).decode('utf-8')
2716
2717                 video_filename = video_url.split('/')[-1]
2718                 video_id, extension = video_filename.split('.')
2719
2720                 info = {
2721                         'id': video_id,
2722                         'url': video_url,
2723                         'uploader': None,
2724                         'upload_date': None,
2725                         'title': video_title,
2726                         'ext': extension,
2727                         'format': extension, # Extension is always(?) mp4, but seems to be flv
2728                         'thumbnail': None,
2729                         'description': video_description,
2730                         'player_url': None,
2731                 }
2732
2733                 return [info]
2734
2735 class MixcloudIE(InfoExtractor):
2736         """Information extractor for www.mixcloud.com"""
2737         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2738         IE_NAME = u'mixcloud'
2739
2740         def __init__(self, downloader=None):
2741                 InfoExtractor.__init__(self, downloader)
2742
2743         def report_download_json(self, file_id):
2744                 """Report JSON download."""
2745                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2746
2747         def report_extraction(self, file_id):
2748                 """Report information extraction."""
2749                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2750
2751         def get_urls(self, jsonData, fmt, bitrate='best'):
2752                 """Get urls from 'audio_formats' section in json"""
2753                 file_url = None
2754                 try:
2755                         bitrate_list = jsonData[fmt]
2756                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2757                                 bitrate = max(bitrate_list) # select highest
2758
2759                         url_list = jsonData[fmt][bitrate]
2760                 except TypeError: # we have no bitrate info.
2761                         url_list = jsonData[fmt]
2762                 return url_list
2763
2764         def check_urls(self, url_list):
2765                 """Returns 1st active url from list"""
2766                 for url in url_list:
2767                         try:
2768                                 urllib2.urlopen(url)
2769                                 return url
2770                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2771                                 url = None
2772
2773                 return None
2774
2775         def _print_formats(self, formats):
2776                 print 'Available formats:'
2777                 for fmt in formats.keys():
2778                         for b in formats[fmt]:
2779                                 try:
2780                                         ext = formats[fmt][b][0]
2781                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2782                                 except TypeError: # we have no bitrate info
2783                                         ext = formats[fmt][0]
2784                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2785                                         break
2786
2787         def _real_extract(self, url):
2788                 mobj = re.match(self._VALID_URL, url)
2789                 if mobj is None:
2790                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2791                         return
2792                 # extract uploader & filename from url
2793                 uploader = mobj.group(1).decode('utf-8')
2794                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2795
2796                 # construct API request
2797                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2798                 # retrieve .json file with links to files
2799                 request = urllib2.Request(file_url)
2800                 try:
2801                         self.report_download_json(file_url)
2802                         jsonData = urllib2.urlopen(request).read()
2803                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2804                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2805                         return
2806
2807                 # parse JSON
2808                 json_data = json.loads(jsonData)
2809                 player_url = json_data['player_swf_url']
2810                 formats = dict(json_data['audio_formats'])
2811
2812                 req_format = self._downloader.params.get('format', None)
2813                 bitrate = None
2814
2815                 if self._downloader.params.get('listformats', None):
2816                         self._print_formats(formats)
2817                         return
2818
2819                 if req_format is None or req_format == 'best':
2820                         for format_param in formats.keys():
2821                                 url_list = self.get_urls(formats, format_param)
2822                                 # check urls
2823                                 file_url = self.check_urls(url_list)
2824                                 if file_url is not None:
2825                                         break # got it!
2826                 else:
2827                         if req_format not in formats.keys():
2828                                 self._downloader.trouble(u'ERROR: format is not available')
2829                                 return
2830
2831                         url_list = self.get_urls(formats, req_format)
2832                         file_url = self.check_urls(url_list)
2833                         format_param = req_format
2834
2835                 return [{
2836                         'id': file_id.decode('utf-8'),
2837                         'url': file_url.decode('utf-8'),
2838                         'uploader':     uploader.decode('utf-8'),
2839                         'upload_date': u'NA',
2840                         'title': json_data['name'],
2841                         'ext': file_url.split('.')[-1].decode('utf-8'),
2842                         'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2843                         'thumbnail': json_data['thumbnail_url'],
2844                         'description': json_data['description'],
2845                         'player_url': player_url.decode('utf-8'),
2846                 }]
2847
2848 class StanfordOpenClassroomIE(InfoExtractor):
2849         """Information extractor for Stanford's Open ClassRoom"""
2850
2851         _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2852         IE_NAME = u'stanfordoc'
2853
2854         def report_download_webpage(self, objid):
2855                 """Report information extraction."""
2856                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2857
2858         def report_extraction(self, video_id):
2859                 """Report information extraction."""
2860                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2861
2862         def _real_extract(self, url):
2863                 mobj = re.match(self._VALID_URL, url)
2864                 if mobj is None:
2865                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2866                         return
2867
2868                 if mobj.group('course') and mobj.group('video'): # A specific video
2869                         course = mobj.group('course')
2870                         video = mobj.group('video')
2871                         info = {
2872                                 'id': course + '_' + video,
2873                         }
2874
2875                         self.report_extraction(info['id'])
2876                         baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2877                         xmlUrl = baseUrl + video + '.xml'
2878                         try:
2879                                 metaXml = urllib2.urlopen(xmlUrl).read()
2880                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2881                                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2882                                 return
2883                         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2884                         try:
2885                                 info['title'] = mdoc.findall('./title')[0].text
2886                                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2887                         except IndexError:
2888                                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2889                                 return
2890                         info['ext'] = info['url'].rpartition('.')[2]
2891                         info['format'] = info['ext']
2892                         return [info]
2893                 elif mobj.group('course'): # A course page
2894                         course = mobj.group('course')
2895                         info = {
2896                                 'id': course,
2897                                 'type': 'playlist',
2898                         }
2899
2900                         self.report_download_webpage(info['id'])
2901                         try:
2902                                 coursepage = urllib2.urlopen(url).read()
2903                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2904                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2905                                 return
2906
2907                         m = re.search('<h1>([^<]+)</h1>', coursepage)
2908                         if m:
2909                                 info['title'] = unescapeHTML(m.group(1))
2910                         else:
2911                                 info['title'] = info['id']
2912
2913                         m = re.search('<description>([^<]+)</description>', coursepage)
2914                         if m:
2915                                 info['description'] = unescapeHTML(m.group(1))
2916
2917                         links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2918                         info['list'] = [
2919                                 {
2920                                         'type': 'reference',
2921                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2922                                 }
2923                                         for vpage in links]
2924                         results = []
2925                         for entry in info['list']:
2926                                 assert entry['type'] == 'reference'
2927                                 results += self.extract(entry['url'])
2928                         return results
2929
2930                 else: # Root page
2931                         info = {
2932                                 'id': 'Stanford OpenClassroom',
2933                                 'type': 'playlist',
2934                         }
2935
2936                         self.report_download_webpage(info['id'])
2937                         rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2938                         try:
2939                                 rootpage = urllib2.urlopen(rootURL).read()
2940                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2941                                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2942                                 return
2943
2944                         info['title'] = info['id']
2945
2946                         links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2947                         info['list'] = [
2948                                 {
2949                                         'type': 'reference',
2950                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2951                                 }
2952                                         for cpage in links]
2953
2954                         results = []
2955                         for entry in info['list']:
2956                                 assert entry['type'] == 'reference'
2957                                 results += self.extract(entry['url'])
2958                         return results
2959
2960 class MTVIE(InfoExtractor):
2961         """Information extractor for MTV.com"""
2962
2963         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2964         IE_NAME = u'mtv'
2965
2966         def report_webpage(self, video_id):
2967                 """Report information extraction."""
2968                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2969
2970         def report_extraction(self, video_id):
2971                 """Report information extraction."""
2972                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2973
2974         def _real_extract(self, url):
2975                 mobj = re.match(self._VALID_URL, url)
2976                 if mobj is None:
2977                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2978                         return
2979                 if not mobj.group('proto'):
2980                         url = 'http://' + url
2981                 video_id = mobj.group('videoid')
2982                 self.report_webpage(video_id)
2983
2984                 request = urllib2.Request(url)
2985                 try:
2986                         webpage = urllib2.urlopen(request).read()
2987                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2988                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2989                         return
2990
2991                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2992                 if mobj is None:
2993                         self._downloader.trouble(u'ERROR: unable to extract song name')
2994                         return
2995                 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2996                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2997                 if mobj is None:
2998                         self._downloader.trouble(u'ERROR: unable to extract performer')
2999                         return
3000                 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3001                 video_title = performer + ' - ' + song_name
3002
3003                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3004                 if mobj is None:
3005                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3006                         return
3007                 mtvn_uri = mobj.group(1)
3008
3009                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3010                 if mobj is None:
3011                         self._downloader.trouble(u'ERROR: unable to extract content id')
3012                         return
3013                 content_id = mobj.group(1)
3014
3015                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3016                 self.report_extraction(video_id)
3017                 request = urllib2.Request(videogen_url)
3018                 try:
3019                         metadataXml = urllib2.urlopen(request).read()
3020                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3021                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
3022                         return
3023
3024                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3025                 renditions = mdoc.findall('.//rendition')
3026
3027                 # For now, always pick the highest quality.
3028                 rendition = renditions[-1]
3029
3030                 try:
3031                         _,_,ext = rendition.attrib['type'].partition('/')
3032                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3033                         video_url = rendition.find('./src').text
3034                 except KeyError:
3035                         self._downloader.trouble('Invalid rendition field.')
3036                         return
3037
3038                 info = {
3039                         'id': video_id,
3040                         'url': video_url,
3041                         'uploader': performer,
3042                         'title': video_title,
3043                         'ext': ext,
3044                         'format': format,
3045                 }
3046
3047                 return [info]
3048
3049
3050 class YoukuIE(InfoExtractor):
3051
3052         _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3053         IE_NAME = u'Youku'
3054
3055         def __init__(self, downloader=None):
3056                 InfoExtractor.__init__(self, downloader)
3057
3058         def report_download_webpage(self, file_id):
3059                 """Report webpage download."""
3060                 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3061
3062         def report_extraction(self, file_id):
3063                 """Report information extraction."""
3064                 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
3065
3066         def _gen_sid(self):
3067                 nowTime = int(time.time() * 1000)
3068                 random1 = random.randint(1000,1998)
3069                 random2 = random.randint(1000,9999)
3070
3071                 return "%d%d%d" %(nowTime,random1,random2)
3072
3073         def _get_file_ID_mix_string(self, seed):
3074                 mixed = []
3075                 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3076                 seed = float(seed)
3077                 for i in range(len(source)):
3078                         seed  =  (seed * 211 + 30031 ) % 65536
3079                         index  =  math.floor(seed / 65536 * len(source) )
3080                         mixed.append(source[int(index)])
3081                         source.remove(source[int(index)])
3082                 #return ''.join(mixed)
3083                 return mixed
3084
3085         def _get_file_id(self, fileId, seed):
3086                 mixed = self._get_file_ID_mix_string(seed)
3087                 ids = fileId.split('*')
3088                 realId = []
3089                 for ch in ids:
3090                         if ch:
3091                                 realId.append(mixed[int(ch)])
3092                 return ''.join(realId)
3093
3094         def _real_extract(self, url):
3095                 mobj = re.match(self._VALID_URL, url)
3096                 if mobj is None:
3097                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3098                         return
3099                 video_id = mobj.group('ID')
3100
3101                 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3102
3103                 request = urllib2.Request(info_url, None, std_headers)
3104                 try:
3105                         self.report_download_webpage(video_id)
3106                         jsondata = urllib2.urlopen(request).read()
3107                 except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
3108                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3109                         return
3110
3111                 self.report_extraction(video_id)
3112                 try:
3113                         config = json.loads(jsondata)
3114
3115                         video_title =  config['data'][0]['title']
3116                         seed = config['data'][0]['seed']
3117
3118                         format = self._downloader.params.get('format', None)
3119                         supported_format = config['data'][0]['streamfileids'].keys()
3120
3121                         if format is None or format == 'best':
3122                                 if 'hd2' in supported_format:
3123                                         format = 'hd2'
3124                                 else:
3125                                         format = 'flv'
3126                                 ext = u'flv'
3127                         elif format == 'worst':
3128                                 format = 'mp4'
3129                                 ext = u'mp4'
3130                         else:
3131                                 format = 'flv'
3132                                 ext = u'flv'
3133
3134
3135                         fileid = config['data'][0]['streamfileids'][format]
3136                         seg_number = len(config['data'][0]['segs'][format])
3137
3138                         keys=[]
3139                         for i in xrange(seg_number):
3140                                 keys.append(config['data'][0]['segs'][format][i]['k'])
3141
3142                         #TODO check error
3143                         #youku only could be viewed from mainland china
3144                 except:
3145                         self._downloader.trouble(u'ERROR: unable to extract info section')
3146                         return
3147
3148                 files_info=[]
3149                 sid = self._gen_sid()
3150                 fileid = self._get_file_id(fileid, seed)
3151
3152                 #column 8,9 of fileid represent the segment number
3153                 #fileid[7:9] should be changed
3154                 for index, key in enumerate(keys):
3155
3156                         temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3157                         download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3158
3159                         info = {
3160                                 'id': '%s_part%02d' % (video_id, index),
3161                                 'url': download_url,
3162                                 'uploader': None,
3163                                 'title': video_title,
3164                                 'ext': ext,
3165                                 'format': u'NA'
3166                         }
3167                         files_info.append(info)
3168
3169                 return files_info
3170
3171
3172 class XNXXIE(InfoExtractor):
3173         """Information extractor for xnxx.com"""
3174
3175         _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3176         IE_NAME = u'xnxx'
3177         VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
3178         VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3179         VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
3180
3181         def report_webpage(self, video_id):
3182                 """Report information extraction"""
3183                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3184
3185         def report_extraction(self, video_id):
3186                 """Report information extraction"""
3187                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3188
3189         def _real_extract(self, url):
3190                 mobj = re.match(self._VALID_URL, url)
3191                 if mobj is None:
3192                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3193                         return
3194                 video_id = mobj.group(1).decode('utf-8')
3195
3196                 self.report_webpage(video_id)
3197
3198                 # Get webpage content
3199                 try:
3200                         webpage = urllib2.urlopen(url).read()
3201                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3202                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3203                         return
3204
3205                 result = re.search(self.VIDEO_URL_RE, webpage)
3206                 if result is None:
3207                         self._downloader.trouble(u'ERROR: unable to extract video url')
3208                         return
3209                 video_url = urllib.unquote(result.group(1).decode('utf-8'))
3210
3211                 result = re.search(self.VIDEO_TITLE_RE, webpage)
3212                 if result is None:
3213                         self._downloader.trouble(u'ERROR: unable to extract video title')
3214                         return
3215                 video_title = result.group(1).decode('utf-8')
3216
3217                 result = re.search(self.VIDEO_THUMB_RE, webpage)
3218                 if result is None:
3219                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3220                         return
3221                 video_thumbnail = result.group(1).decode('utf-8')
3222
3223                 info = {'id': video_id,
3224                                 'url': video_url,
3225                                 'uploader': None,
3226                                 'upload_date': None,
3227                                 'title': video_title,
3228                                 'ext': 'flv',
3229                                 'format': 'flv',
3230                                 'thumbnail': video_thumbnail,
3231                                 'description': None,
3232                                 'player_url': None}
3233
3234                 return [info]
3235
3236
3237 class GooglePlusIE(InfoExtractor):
3238         """Information extractor for plus.google.com."""
3239
3240         _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3241         IE_NAME = u'plus.google'
3242
3243         def __init__(self, downloader=None):
3244                 InfoExtractor.__init__(self, downloader)
3245
3246         def report_extract_entry(self, url):
3247                 """Report downloading extry"""
3248                 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3249
3250         def report_date(self, upload_date):
3251                 """Report downloading extry"""
3252                 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3253
3254         def report_uploader(self, uploader):
3255                 """Report downloading extry"""
3256                 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3257
3258         def report_title(self, video_title):
3259                 """Report downloading extry"""
3260                 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3261
3262         def report_extract_vid_page(self, video_page):
3263                 """Report information extraction."""
3264                 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3265
3266         def _real_extract(self, url):
3267                 # Extract id from URL
3268                 mobj = re.match(self._VALID_URL, url)
3269                 if mobj is None:
3270                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3271                         return
3272
3273                 post_url = mobj.group(0)
3274                 video_id = mobj.group(2)
3275
3276                 video_extension = 'flv'
3277
3278                 # Step 1, Retrieve post webpage to extract further information
3279                 self.report_extract_entry(post_url)
3280                 request = urllib2.Request(post_url)
3281                 try:
3282                         webpage = urllib2.urlopen(request).read()
3283                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3284                         self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % str(err))
3285                         return
3286
3287                 # Extract update date
3288                 upload_date = u'NA'
3289                 pattern = 'title="Timestamp">(.*?)</a>'
3290                 mobj = re.search(pattern, webpage)
3291                 if mobj:
3292                         upload_date = mobj.group(1)
3293                         # Convert timestring to a format suitable for filename
3294                         upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3295                         upload_date = upload_date.strftime('%Y%m%d')
3296                 self.report_date(upload_date)
3297
3298                 # Extract uploader
3299                 uploader = u'NA'
3300                 pattern = r'rel\="author".*?>(.*?)</a>'
3301                 mobj = re.search(pattern, webpage)
3302                 if mobj:
3303                         uploader = mobj.group(1)
3304                 self.report_uploader(uploader)
3305
3306                 # Extract title
3307                 # Get the first line for title
3308                 video_title = u'NA'
3309                 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3310                 mobj = re.search(pattern, webpage)
3311                 if mobj:
3312                         video_title = mobj.group(1)
3313                 self.report_title(video_title)
3314
3315                 # Step 2, Stimulate clicking the image box to launch video
3316                 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3317                 mobj = re.search(pattern, webpage)
3318                 if mobj is None:
3319                         self._downloader.trouble(u'ERROR: unable to extract video page URL')
3320
3321                 video_page = mobj.group(1)
3322                 request = urllib2.Request(video_page)
3323                 try:
3324                         webpage = urllib2.urlopen(request).read()
3325                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3326                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3327                         return
3328                 self.report_extract_vid_page(video_page)
3329
3330
3331                 # Extract video links on video page
3332                 """Extract video links of all sizes"""
3333                 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3334                 mobj = re.findall(pattern, webpage)
3335                 if len(mobj) == 0:
3336                         self._downloader.trouble(u'ERROR: unable to extract video links')
3337
3338                 # Sort in resolution
3339                 links = sorted(mobj)
3340
3341                 # Choose the lowest of the sort, i.e. highest resolution
3342                 video_url = links[-1]
3343                 # Only get the url. The resolution part in the tuple has no use anymore
3344                 video_url = video_url[-1]
3345                 # Treat escaped \u0026 style hex
3346                 video_url = unicode(video_url, "unicode_escape")
3347
3348
3349                 return [{
3350                         'id':           video_id.decode('utf-8'),
3351                         'url':          video_url,
3352                         'uploader':     uploader.decode('utf-8'),
3353                         'upload_date':  upload_date.decode('utf-8'),
3354                         'title':        video_title.decode('utf-8'),
3355                         'ext':          video_extension.decode('utf-8'),
3356                         'format':       u'NA',
3357                         'player_url':   None,
3358                 }]