Added support for HTTP redirects. Closes #315.
[youtube-dl] / youtube_dl / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__  = (
5         'Ricardo Garcia Gonzalez',
6         'Danny Colligan',
7         'Benjamin Johnson',
8         'Vasyl\' Vavrychuk',
9         'Witold Baryluk',
10         'Paweł Paprota',
11         'Gergely Imreh',
12         'Rogério Brito',
13         'Philipp Hagemeister',
14         'Sören Schulze',
15         'Kevin Ngo',
16         'Ori Avtalion',
17         'shizeeg',
18         'Filippo Valsorda',
19         )
20
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
23
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
25
26
27 import cookielib
28 import datetime
29 import getpass
30 import gzip
31 import htmlentitydefs
32 import HTMLParser
33 import httplib
34 import locale
35 import math
36 import netrc
37 import optparse
38 import os
39 import os.path
40 import re
41 import shlex
42 import socket
43 import string
44 import subprocess
45 import sys
46 import time
47 import urllib
48 import urllib2
49 import warnings
50 import zlib
51
52 if os.name == 'nt':
53         import ctypes
54
55 try:
56         import email.utils
57 except ImportError: # Python 2.4
58         import email.Utils
59 try:
60         import cStringIO as StringIO
61 except ImportError:
62         import StringIO
63
64 # parse_qs was moved from the cgi module to the urlparse module recently.
65 try:
66         from urlparse import parse_qs
67 except ImportError:
68         from cgi import parse_qs
69
70 try:
71         import lxml.etree
72 except ImportError:
73         pass # Handled below
74
75 try:
76         import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78         warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
79
80 std_headers = {
81         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84         'Accept-Encoding': 'gzip, deflate',
85         'Accept-Language': 'en-us,en;q=0.5',
86 }
87
# JSON support: use the stdlib json module when available (Python >= 2.6);
# otherwise fall back to a minimal pure-Python parser (trivialjson) exposing
# the same json.loads() entry point.
try:
        import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
        import re
        class json(object):
                @staticmethod
                def loads(s):
                        """Parse a JSON document (bytes, UTF-8) and return the Python value.

                        Recursive-descent parser: each parse* helper takes an index into
                        the decoded text and returns a tuple (next_index, value).
                        Raises ValueError on malformed input.
                        """
                        s = s.decode('UTF-8')
                        def raiseError(msg, i):
                                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
                        def skipSpace(i, expectMore=True):
                                # Advance past JSON whitespace; if expectMore, it is an
                                # error for the input to end here.
                                while i < len(s) and s[i] in ' \t\r\n':
                                        i += 1
                                if expectMore:
                                        if i >= len(s):
                                                raiseError('Premature end', i)
                                return i
                        def decodeEscape(match):
                                # Translate one backslash escape (including \uXXXX and
                                # UTF-16 surrogate pairs) into the character it denotes.
                                esc = match.group(1)
                                _STATIC = {
                                        '"': '"',
                                        '\\': '\\',
                                        '/': '/',
                                        'b': unichr(0x8),
                                        'f': unichr(0xc),
                                        'n': '\n',
                                        'r': '\r',
                                        't': '\t',
                                }
                                if esc in _STATIC:
                                        return _STATIC[esc]
                                if esc[0] == 'u':
                                        if len(esc) == 1+4:
                                                return unichr(int(esc[1:5], 16))
                                        if len(esc) == 5+6 and esc[5:7] == '\\u':
                                                # Surrogate pair: combine high and low halves
                                                # into one code point above U+FFFF.
                                                hi = int(esc[1:5], 16)
                                                low = int(esc[7:11], 16)
                                                return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                                raise ValueError('Unknown escape ' + str(esc))
                        def parseString(i):
                                i += 1
                                e = i
                                while True:
                                        # Find the closing quote, skipping quotes preceded by
                                        # an odd number of backslashes (i.e. escaped quotes).
                                        e = s.index('"', e)
                                        bslashes = 0
                                        while s[e-bslashes-1] == '\\':
                                                bslashes += 1
                                        if bslashes % 2 == 1:
                                                e += 1
                                                continue
                                        break
                                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                                stri = rexp.sub(decodeEscape, s[i:e])
                                return (e+1,stri)
                        def parseObj(i):
                                i += 1
                                res = {}
                                i = skipSpace(i)
                                if s[i] == '}': # Empty dictionary
                                        return (i+1,res)
                                while True:
                                        if s[i] != '"':
                                                raiseError('Expected a string object key', i)
                                        i,key = parseString(i)
                                        i = skipSpace(i)
                                        if i >= len(s) or s[i] != ':':
                                                raiseError('Expected a colon', i)
                                        i,val = parse(i+1)
                                        res[key] = val
                                        i = skipSpace(i)
                                        if s[i] == '}':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected comma or closing curly brace', i)
                                        i = skipSpace(i+1)
                        def parseArray(i):
                                res = []
                                i = skipSpace(i+1)
                                if s[i] == ']': # Empty array
                                        return (i+1,res)
                                while True:
                                        i,val = parse(i)
                                        res.append(val)
                                        i = skipSpace(i) # Raise exception if premature end
                                        if s[i] == ']':
                                                return (i+1, res)
                                        if s[i] != ',':
                                                raiseError('Expected a comma or closing bracket', i)
                                        i = skipSpace(i+1)
                        def parseDiscrete(i):
                                # The three JSON keyword literals.
                                for k,v in {'true': True, 'false': False, 'null': None}.items():
                                        if s.startswith(k, i):
                                                return (i+len(k), v)
                                raiseError('Not a boolean (or null)', i)
                        def parseNumber(i):
                                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                                if mobj is None:
                                        raiseError('Not a number', i)
                                nums = mobj.group(1)
                                # A fraction or exponent makes it a float; otherwise int.
                                if '.' in nums or 'e' in nums or 'E' in nums:
                                        return (i+len(nums), float(nums))
                                return (i+len(nums), int(nums))
                        CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
                        def parse(i):
                                # Dispatch on the first non-space character; anything not
                                # in CHARMAP is attempted as a number.
                                i = skipSpace(i)
                                i,res = CHARMAP.get(s[i], parseNumber)(i)
                                i = skipSpace(i, False)
                                return (i,res)
                        i,res = parse(0)
                        if i < len(s):
                                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
                        return res
200
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        try:
                pref = locale.getpreferredencoding()
                # Make sure the reported encoding actually works.
                u'TEST'.encode(pref)
        except:
                pref = 'UTF-8'
        return pref
216
217
218 def htmlentity_transform(matchobj):
219         """Transforms an HTML entity to a Unicode character.
220
221         This function receives a match object and is intended to be used with
222         the re.sub() function.
223         """
224         entity = matchobj.group(1)
225
226         # Known non-numeric HTML entity
227         if entity in htmlentitydefs.name2codepoint:
228                 return unichr(htmlentitydefs.name2codepoint[entity])
229
230         # Unicode character
231         mobj = re.match(ur'(?u)#(x?\d+)', entity)
232         if mobj is not None:
233                 numstr = mobj.group(1)
234                 if numstr.startswith(u'x'):
235                         base = 16
236                         numstr = u'0%s' % numstr
237                 else:
238                         base = 10
239                 return unichr(long(numstr, base))
240
241         # Unknown entity in name, return its literal representation
242         return (u'&%s;' % entity)
243
244
245 def sanitize_title(utitle):
246         """Sanitizes a video title so it could be used as part of a filename."""
247         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248         return utitle.replace(unicode(os.sep), u'%')
249
250
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # u'-' means "write to stdout"; on Windows, switch stdout to
                        # binary mode so video data is not mangled by newline translation.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(_encodeFilename(filename), open_mode)
                return (stream, filename)
276
277
def timeconvert(timestr):
        """Convert RFC 2822 defined time string into system timestamp"""
        timetuple = email.utils.parsedate_tz(timestr)
        if timetuple is None:
                # Unparsable date string.
                return None
        return email.utils.mktime_tz(timetuple)
285
def _simplify_title(title):
        """Collapse every run of characters other than \w, digits, '_' and '-'
        into a single '_', then strip leading/trailing underscores."""
        expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
        return expr.sub(u'_', title).strip(u'_')
289
290 def _orderedSet(iterable):
291         """ Remove all duplicates from the input iterable """
292         res = []
293         for el in iterable:
294                 if el not in res:
295                         res.append(el)
296         return res
297
def _unescapeHTML(s):
        """
        @param s a string (of type unicode)
        """
        assert type(s) == type(u'')

        # Delegate entity unescaping to HTMLParser's helper.
        parser = HTMLParser.HTMLParser()
        return parser.unescape(s)
306
307 def _encodeFilename(s):
308         """
309         @param s The name of the file (of type unicode)
310         """
311
312         assert type(s) == type(u'')
313
314         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
315                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
316                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
317                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
318                 return s
319         else:
320                 return s.encode(sys.getfilesystemencoding(), 'ignore')
321
class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        # Raised from FileDownloader.trouble() when 'ignoreerrors' is False.
        pass
330
331
class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass
340
class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        pass
348
class MaxDownloadsReached(Exception):
        """Raised to stop further processing once the --max-downloads limit
        has been reached."""
        pass
352
353
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass
361
362
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a finished download turns out
        smaller than the size the server announced, which normally means the
        connection was interrupted mid-transfer.
        """
        # Byte counts: what was actually received vs. what the server promised.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.expected = expected
                self.downloaded = downloaded
377
378
class YoutubeDLHandler(urllib2.HTTPHandler):
        """Handler for HTTP requests and responses.

        This class, when installed with an OpenerDirector, automatically adds
        the standard headers to every HTTP request and handles gzipped and
        deflated responses from web servers. If compression is to be avoided in
        a particular request, the original request in the program code only has
        to include the HTTP header "Youtubedl-No-Compression", which will be
        removed before making the real request.

        Part of this code was copied from:

        http://techknack.net/python-urllib2-handlers/

        Andrew Rowls, the author of that code, agreed to release it to the
        public domain.
        """

        @staticmethod
        def deflate(data):
                # Try a raw deflate stream first; fall back to zlib-wrapped data.
                try:
                        return zlib.decompress(data, -zlib.MAX_WBITS)
                except zlib.error:
                        return zlib.decompress(data)

        @staticmethod
        def addinfourl_wrapper(stream, headers, url, code):
                # Newer Pythons let addinfourl carry the status code directly
                # (detected via the getcode attribute); emulate it on older ones.
                if hasattr(urllib2.addinfourl, 'getcode'):
                        return urllib2.addinfourl(stream, headers, url, code)
                ret = urllib2.addinfourl(stream, headers, url)
                ret.code = code
                return ret

        def http_request(self, req):
                # Force the standard headers, replacing any caller-supplied ones
                # of the same name.
                for h in std_headers:
                        if h in req.headers:
                                del req.headers[h]
                        req.add_header(h, std_headers[h])
                # NOTE: urllib2 stores header names capitalize()d, hence the
                # lowercase-after-first-letter spelling of the marker header.
                if 'Youtubedl-no-compression' in req.headers:
                        if 'Accept-encoding' in req.headers:
                                del req.headers['Accept-encoding']
                        del req.headers['Youtubedl-no-compression']
                return req

        def http_response(self, req, resp):
                # Transparently decompress the body while preserving the original
                # headers, URL, status code and message.
                old_resp = resp
                # gzip
                if resp.headers.get('Content-encoding', '') == 'gzip':
                        gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                # deflate
                if resp.headers.get('Content-encoding', '') == 'deflate':
                        gz = StringIO.StringIO(self.deflate(resp.read()))
                        resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
                        resp.msg = old_resp.msg
                return resp
436
437
438 class FileDownloader(object):
439         """File Downloader class.
440
441         File downloader objects are the ones responsible of downloading the
442         actual video file and writing it to disk if the user has requested
443         it, among some other tasks. In most cases there should be one per
444         program. As, given a video URL, the downloader doesn't know how to
445         extract all the needed information, task that InfoExtractors do, it
446         has to pass the URL to one of them.
447
448         For this, file downloader objects have a method that allows
449         InfoExtractors to be registered in a given order. When it is passed
450         a URL, the file downloader handles it to the first InfoExtractor it
451         finds that reports being able to handle it. The InfoExtractor extracts
452         all the information about the video or videos the URL refers to, and
453         asks the FileDownloader to process the video information, possibly
454         downloading the video.
455
456         File downloaders accept a lot of parameters. In order not to saturate
457         the object constructor with arguments, it receives a dictionary of
458         options instead. These options are available through the params
459         attribute for the InfoExtractors to use. The FileDownloader also
460         registers itself as the downloader in charge for the InfoExtractors
461         that are added to it, so this is a "mutual registration".
462
463         Available options:
464
465         username:         Username for authentication purposes.
466         password:         Password for authentication purposes.
467         usenetrc:         Use netrc for authentication instead.
468         quiet:            Do not print messages to stdout.
469         forceurl:         Force printing final URL.
470         forcetitle:       Force printing title.
471         forcethumbnail:   Force printing thumbnail URL.
472         forcedescription: Force printing description.
473         forcefilename:    Force printing final filename.
474         simulate:         Do not download the video files.
475         format:           Video format code.
476         format_limit:     Highest quality format to try.
477         outtmpl:          Template for output names.
478         ignoreerrors:     Do not stop on download errors.
479         ratelimit:        Download speed limit, in bytes/sec.
480         nooverwrites:     Prevent overwriting files.
481         retries:          Number of times to retry for HTTP error 5xx
482         continuedl:       Try to continue downloads if possible.
483         noprogress:       Do not print the progress bar.
484         playliststart:    Playlist item to start at.
485         playlistend:      Playlist item to end at.
486         matchtitle:       Download only matching titles.
487         rejecttitle:      Reject downloads for matching titles.
488         logtostderr:      Log messages to stderr instead of stdout.
489         consoletitle:     Display progress in console window's titlebar.
490         nopart:           Do not use temporary .part files.
491         updatetime:       Use the Last-modified header to set output file timestamps.
492         writedescription: Write the video description to a .description file
493         writeinfojson:    Write the video description to a .info.json file
494         writesubtitles:   Write the video subtitles to a .srt file
495         subtitleslang:    Language of the subtitles to download
496         """
497
        # Class-level defaults; real values are assigned per-instance in __init__.
        params = None              # Options dictionary (see class docstring)
        _ies = []                  # Registered InfoExtractor objects, in order
        _pps = []                  # Registered PostProcessor chain, in order
        _download_retcode = None   # 0 initially; set to 1 by trouble()
        _num_downloads = None      # Number of downloads so far (starts at 0)
        _screen_file = None        # Stream used by to_screen(): stdout or stderr
504
        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self._num_downloads = 0
                # Index with the boolean: False -> sys.stdout, True -> sys.stderr.
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
                self.params = params
513
514         @staticmethod
515         def format_bytes(bytes):
516                 if bytes is None:
517                         return 'N/A'
518                 if type(bytes) is str:
519                         bytes = float(bytes)
520                 if bytes == 0.0:
521                         exponent = 0
522                 else:
523                         exponent = long(math.log(bytes, 1024.0))
524                 suffix = 'bkMGTPEZY'[exponent]
525                 converted = float(bytes) / float(1024 ** exponent)
526                 return '%.2f%s' % (converted, suffix)
527
528         @staticmethod
529         def calc_percent(byte_counter, data_len):
530                 if data_len is None:
531                         return '---.-%'
532                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
533
534         @staticmethod
535         def calc_eta(start, now, total, current):
536                 if total is None:
537                         return '--:--'
538                 dif = now - start
539                 if current == 0 or dif < 0.001: # One millisecond
540                         return '--:--'
541                 rate = float(current) / dif
542                 eta = long((float(total) - float(current)) / rate)
543                 (eta_mins, eta_secs) = divmod(eta, 60)
544                 if eta_mins > 99:
545                         return '--:--'
546                 return '%02d:%02d' % (eta_mins, eta_secs)
547
548         @staticmethod
549         def calc_speed(start, now, bytes):
550                 dif = now - start
551                 if bytes == 0 or dif < 0.001: # One millisecond
552                         return '%10s' % '---b/s'
553                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
554
555         @staticmethod
556         def best_block_size(elapsed_time, bytes):
557                 new_min = max(bytes / 2.0, 1.0)
558                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
559                 if elapsed_time < 0.001:
560                         return long(new_max)
561                 rate = bytes / elapsed_time
562                 if rate > new_max:
563                         return long(new_max)
564                 if rate < new_min:
565                         return long(new_min)
566                 return long(rate)
567
568         @staticmethod
569         def parse_bytes(bytestr):
570                 """Parse a string indicating a byte quantity into a long integer."""
571                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
572                 if matchobj is None:
573                         return None
574                 number = float(matchobj.group(1))
575                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
576                 return long(round(number * multiplier))
577
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration (see class docstring): give the IE a
                # reference back to this downloader.
                ie.set_downloader(self)
582
        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                # Mutual registration, as for InfoExtractors.
                pp.set_downloader(self)
587
        def to_screen(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                assert type(message) == type(u'')
                if not self.params.get('quiet', False):
                        # skip_eol suppresses the trailing newline (used when
                        # redrawing the progress line in place).
                        terminator = [u'\n', u''][skip_eol]
                        output = message + terminator

                        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
                                output = output.encode(preferredencoding(), 'ignore')
                        self._screen_file.write(output)
                        self._screen_file.flush()
599
        def to_stderr(self, message):
                """Print message to stderr."""
                # Unlike to_screen(), this ignores the 'quiet' option and always
                # writes to stderr, encoded with the locale's preferred encoding.
                print >>sys.stderr, message.encode(preferredencoding())
603
        def to_cons_title(self, message):
                """Set console/terminal window title to message."""
                if not self.params.get('consoletitle', False):
                        return
                if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
                        # c_wchar_p() might not be necessary if `message` is
                        # already of type unicode()
                        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
                elif 'TERM' in os.environ:
                        # xterm-style escape sequence: OSC 0 sets the window title.
                        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
614
        def fixed_template(self):
                """Checks if the output template is fixed."""
                # "Fixed" means it contains no %(field)s placeholders, i.e. every
                # download would be written to the very same file name.
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
618
        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # Only reached with 'ignoreerrors': remember that something failed
                # so the process can exit non-zero at the end.
                self._download_retcode = 1
631
632         def slow_down(self, start_time, byte_counter):
633                 """Sleep if the download speed is over the rate limit."""
634                 rate_limit = self.params.get('ratelimit', None)
635                 if rate_limit is None or byte_counter == 0:
636                         return
637                 now = time.time()
638                 elapsed = now - start_time
639                 if elapsed <= 0.0:
640                         return
641                 speed = float(byte_counter) / elapsed
642                 if speed > rate_limit:
643                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
644
        def temp_name(self, filename):
                """Returns a temporary filename for the given filename."""
                # Download straight to the final name when .part files are
                # disabled, when writing to stdout, or when the target exists
                # but is not a regular file (e.g. a named pipe or device).
                if self.params.get('nopart', False) or filename == u'-' or \
                                (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
                        return filename
                return filename + u'.part'
651
652         def undo_temp_name(self, filename):
653                 if filename.endswith(u'.part'):
654                         return filename[:-len(u'.part')]
655                 return filename
656
        def try_rename(self, old_filename, new_filename):
                """Rename old_filename to new_filename, routing failures through
                self.trouble() (which may raise DownloadError)."""
                try:
                        if old_filename == new_filename:
                                return
                        os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
                except (IOError, OSError), err:
                        self.trouble(u'ERROR: unable to rename file')
664
665         def try_utime(self, filename, last_modified_hdr):
666                 """Try to set the last-modified time of the given file."""
667                 if last_modified_hdr is None:
668                         return
669                 if not os.path.isfile(_encodeFilename(filename)):
670                         return
671                 timestr = last_modified_hdr
672                 if timestr is None:
673                         return
674                 filetime = timeconvert(timestr)
675                 if filetime is None:
676                         return filetime
677                 try:
678                         os.utime(filename, (time.time(), filetime))
679                 except:
680                         pass
681                 return filetime
682
        def report_writedescription(self, descfn):
                """ Report that the description file is being written """
                self.to_screen(u'[info] Writing video description to: ' + descfn)
686
        def report_writesubtitles(self, srtfn):
                """ Report that the subtitles file is being written """
                self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
690
        def report_writeinfojson(self, infofn):
                """ Report that the JSON metadata file is being written """
                self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
694
695         def report_destination(self, filename):
696                 """Report destination filename."""
697                 self.to_screen(u'[download] Destination: ' + filename)
698
699         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
700                 """Report download progress."""
701                 if self.params.get('noprogress', False):
702                         return
703                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
704                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
705                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
706                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
707
708         def report_resuming_byte(self, resume_len):
709                 """Report attempt to resume at given byte."""
710                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
711
712         def report_retry(self, count, retries):
713                 """Report retry in case of HTTP error 5xx"""
714                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
715
716         def report_file_already_downloaded(self, file_name):
717                 """Report file has already been fully downloaded."""
718                 try:
719                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
720                 except (UnicodeEncodeError), err:
721                         self.to_screen(u'[download] The file has already been downloaded')
722
723         def report_unable_to_resume(self):
724                 """Report it was impossible to resume download."""
725                 self.to_screen(u'[download] Unable to resume')
726
727         def report_finish(self):
728                 """Report download finished."""
729                 if self.params.get('noprogress', False):
730                         self.to_screen(u'[download] Download completed')
731                 else:
732                         self.to_screen(u'')
733
734         def increment_downloads(self):
735                 """Increment the ordinal that assigns a number to each file."""
736                 self._num_downloads += 1
737
738         def prepare_filename(self, info_dict):
739                 """Generate the output filename."""
740                 try:
741                         template_dict = dict(info_dict)
742                         template_dict['epoch'] = unicode(long(time.time()))
743                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
744                         filename = self.params['outtmpl'] % template_dict
745                         return filename
746                 except (ValueError, KeyError), err:
747                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
748                         return None
749
750         def _match_entry(self, info_dict):
751                 """ Returns None iff the file should be downloaded """
752
753                 title = info_dict['title']
754                 matchtitle = self.params.get('matchtitle', False)
755                 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
756                         return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
757                 rejecttitle = self.params.get('rejecttitle', False)
758                 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
759                         return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
760                 return None
761
762         def process_info(self, info_dict):
763                 """Process a single dictionary returned by an InfoExtractor."""
764
765                 reason = self._match_entry(info_dict)
766                 if reason is not None:
767                         self.to_screen(u'[download] ' + reason)
768                         return
769
770                 max_downloads = self.params.get('max_downloads')
771                 if max_downloads is not None:
772                         if self._num_downloads > int(max_downloads):
773                                 raise MaxDownloadsReached()
774
775                 filename = self.prepare_filename(info_dict)
776                 
777                 # Forced printings
778                 if self.params.get('forcetitle', False):
779                         print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
780                 if self.params.get('forceurl', False):
781                         print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
782                 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
783                         print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
784                 if self.params.get('forcedescription', False) and 'description' in info_dict:
785                         print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
786                 if self.params.get('forcefilename', False) and filename is not None:
787                         print filename.encode(preferredencoding(), 'xmlcharrefreplace')
788                 if self.params.get('forceformat', False):
789                         print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
790
791                 # Do nothing else if in simulate mode
792                 if self.params.get('simulate', False):
793                         return
794
795                 if filename is None:
796                         return
797
798                 try:
799                         dn = os.path.dirname(_encodeFilename(filename))
800                         if dn != '' and not os.path.exists(dn): # dn is already encoded
801                                 os.makedirs(dn)
802                 except (OSError, IOError), err:
803                         self.trouble(u'ERROR: unable to create directory ' + unicode(err))
804                         return
805
806                 if self.params.get('writedescription', False):
807                         try:
808                                 descfn = filename + u'.description'
809                                 self.report_writedescription(descfn)
810                                 descfile = open(_encodeFilename(descfn), 'wb')
811                                 try:
812                                         descfile.write(info_dict['description'].encode('utf-8'))
813                                 finally:
814                                         descfile.close()
815                         except (OSError, IOError):
816                                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
817                                 return
818                                 
819                 if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
820                         # subtitles download errors are already managed as troubles in relevant IE
821                         # that way it will silently go on when used with unsupporting IE 
822                         try:
823                                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
824                                 self.report_writesubtitles(srtfn)
825                                 srtfile = open(_encodeFilename(srtfn), 'wb')
826                                 try:
827                                         srtfile.write(info_dict['subtitles'].encode('utf-8'))
828                                 finally:
829                                         srtfile.close()
830                         except (OSError, IOError):
831                                 self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
832                                 return
833
834                 if self.params.get('writeinfojson', False):
835                         infofn = filename + u'.info.json'
836                         self.report_writeinfojson(infofn)
837                         try:
838                                 json.dump
839                         except (NameError,AttributeError):
840                                 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
841                                 return
842                         try:
843                                 infof = open(_encodeFilename(infofn), 'wb')
844                                 try:
845                                         json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
846                                         json.dump(json_info_dict, infof)
847                                 finally:
848                                         infof.close()
849                         except (OSError, IOError):
850                                 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
851                                 return
852
853                 if not self.params.get('skip_download', False):
854                         if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
855                                 success = True
856                         else:
857                                 try:
858                                         success = self._do_download(filename, info_dict)
859                                 except (OSError, IOError), err:
860                                         raise UnavailableVideoError
861                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
862                                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
863                                         return
864                                 except (ContentTooShortError, ), err:
865                                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
866                                         return
867         
868                         if success:
869                                 try:
870                                         self.post_process(filename, info_dict)
871                                 except (PostProcessingError), err:
872                                         self.trouble(u'ERROR: postprocessing: %s' % str(err))
873                                         return
874
875         def download(self, url_list):
876                 """Download a given list of URLs."""
877                 if len(url_list) > 1 and self.fixed_template():
878                         raise SameFileError(self.params['outtmpl'])
879
880                 for url in url_list:
881                         suitable_found = False
882                         for ie in self._ies:
883                                 # Go to next InfoExtractor if not suitable
884                                 if not ie.suitable(url):
885                                         continue
886
887                                 # Suitable InfoExtractor found
888                                 suitable_found = True
889
890                                 # Extract information from URL and process it
891                                 ie.extract(url)
892
893                                 # Suitable InfoExtractor had been found; go to next URL
894                                 break
895
896                         if not suitable_found:
897                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
898
899                 return self._download_retcode
900
901         def post_process(self, filename, ie_info):
902                 """Run the postprocessing chain on the given file."""
903                 info = dict(ie_info)
904                 info['filepath'] = filename
905                 for pp in self._pps:
906                         info = pp.run(info)
907                         if info is None:
908                                 break
909
910         def _download_with_rtmpdump(self, filename, url, player_url):
911                 self.report_destination(filename)
912                 tmpfilename = self.temp_name(filename)
913
914                 # Check for rtmpdump first
915                 try:
916                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
917                 except (OSError, IOError):
918                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
919                         return False
920
921                 # Download using rtmpdump. rtmpdump returns exit code 2 when
922                 # the connection was interrumpted and resuming appears to be
923                 # possible. This is part of rtmpdump's normal usage, AFAIK.
924                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
925                 args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
926                 if self.params.get('verbose', False):
927                         try:
928                                 import pipes
929                                 shell_quote = lambda args: ' '.join(map(pipes.quote, args))
930                         except ImportError:
931                                 shell_quote = repr
932                         self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
933                 retval = subprocess.call(args)
934                 while retval == 2 or retval == 1:
935                         prevsize = os.path.getsize(_encodeFilename(tmpfilename))
936                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
937                         time.sleep(5.0) # This seems to be needed
938                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
939                         cursize = os.path.getsize(_encodeFilename(tmpfilename))
940                         if prevsize == cursize and retval == 1:
941                                 break
942                          # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
943                         if prevsize == cursize and retval == 2 and cursize > 1024:
944                                 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
945                                 retval = 0
946                                 break
947                 if retval == 0:
948                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
949                         self.try_rename(tmpfilename, filename)
950                         return True
951                 else:
952                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
953                         return False
954
955         def _do_download(self, filename, info_dict):
956                 url = info_dict['url']
957                 player_url = info_dict.get('player_url', None)
958
959                 # Check file already present
960                 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
961                         self.report_file_already_downloaded(filename)
962                         return True
963
964                 # Attempt to download using rtmpdump
965                 if url.startswith('rtmp'):
966                         return self._download_with_rtmpdump(filename, url, player_url)
967
968                 tmpfilename = self.temp_name(filename)
969                 stream = None
970
971                 # Do not include the Accept-Encoding header
972                 headers = {'Youtubedl-no-compression': 'True'}
973                 basic_request = urllib2.Request(url, None, headers)
974                 request = urllib2.Request(url, None, headers)
975
976                 # Establish possible resume length
977                 if os.path.isfile(_encodeFilename(tmpfilename)):
978                         resume_len = os.path.getsize(_encodeFilename(tmpfilename))
979                 else:
980                         resume_len = 0
981
982                 open_mode = 'wb'
983                 if resume_len != 0:
984                         if self.params.get('continuedl', False):
985                                 self.report_resuming_byte(resume_len)
986                                 request.add_header('Range','bytes=%d-' % resume_len)
987                                 open_mode = 'ab'
988                         else:
989                                 resume_len = 0
990
991                 count = 0
992                 retries = self.params.get('retries', 0)
993                 while count <= retries:
994                         # Establish connection
995                         try:
996                                 if count == 0 and 'urlhandle' in info_dict:
997                                         data = info_dict['urlhandle']
998                                 data = urllib2.urlopen(request)
999                                 break
1000                         except (urllib2.HTTPError, ), err:
1001                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
1002                                         # Unexpected HTTP error
1003                                         raise
1004                                 elif err.code == 416:
1005                                         # Unable to resume (requested range not satisfiable)
1006                                         try:
1007                                                 # Open the connection again without the range header
1008                                                 data = urllib2.urlopen(basic_request)
1009                                                 content_length = data.info()['Content-Length']
1010                                         except (urllib2.HTTPError, ), err:
1011                                                 if err.code < 500 or err.code >= 600:
1012                                                         raise
1013                                         else:
1014                                                 # Examine the reported length
1015                                                 if (content_length is not None and
1016                                                                 (resume_len - 100 < long(content_length) < resume_len + 100)):
1017                                                         # The file had already been fully downloaded.
1018                                                         # Explanation to the above condition: in issue #175 it was revealed that
1019                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
1020                                                         # changing the file size slightly and causing problems for some users. So
1021                                                         # I decided to implement a suggested change and consider the file
1022                                                         # completely downloaded if the file size differs less than 100 bytes from
1023                                                         # the one in the hard drive.
1024                                                         self.report_file_already_downloaded(filename)
1025                                                         self.try_rename(tmpfilename, filename)
1026                                                         return True
1027                                                 else:
1028                                                         # The length does not match, we start the download over
1029                                                         self.report_unable_to_resume()
1030                                                         open_mode = 'wb'
1031                                                         break
1032                         # Retry
1033                         count += 1
1034                         if count <= retries:
1035                                 self.report_retry(count, retries)
1036
1037                 if count > retries:
1038                         self.trouble(u'ERROR: giving up after %s retries' % retries)
1039                         return False
1040
1041                 data_len = data.info().get('Content-length', None)
1042                 if data_len is not None:
1043                         data_len = long(data_len) + resume_len
1044                 data_len_str = self.format_bytes(data_len)
1045                 byte_counter = 0 + resume_len
1046                 block_size = 1024
1047                 start = time.time()
1048                 while True:
1049                         # Download and write
1050                         before = time.time()
1051                         data_block = data.read(block_size)
1052                         after = time.time()
1053                         if len(data_block) == 0:
1054                                 break
1055                         byte_counter += len(data_block)
1056
1057                         # Open file just in time
1058                         if stream is None:
1059                                 try:
1060                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1061                                         assert stream is not None
1062                                         filename = self.undo_temp_name(tmpfilename)
1063                                         self.report_destination(filename)
1064                                 except (OSError, IOError), err:
1065                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1066                                         return False
1067                         try:
1068                                 stream.write(data_block)
1069                         except (IOError, OSError), err:
1070                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1071                                 return False
1072                         block_size = self.best_block_size(after - before, len(data_block))
1073
1074                         # Progress message
1075                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1076                         if data_len is None:
1077                                 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1078                         else:
1079                                 percent_str = self.calc_percent(byte_counter, data_len)
1080                                 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1081                                 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1082
1083                         # Apply rate limit
1084                         self.slow_down(start, byte_counter - resume_len)
1085
1086                 if stream is None:
1087                         self.trouble(u'\nERROR: Did not get any data blocks')
1088                         return False
1089                 stream.close()
1090                 self.report_finish()
1091                 if data_len is not None and byte_counter != data_len:
1092                         raise ContentTooShortError(byte_counter, long(data_len))
1093                 self.try_rename(tmpfilename, filename)
1094
1095                 # Update file modification time
1096                 if self.params.get('updatetime', True):
1097                         info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1098
1099                 return True
1100
1101
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and pulls out of it everything the
	FileDownloader needs: the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader, which processes it, possibly
	downloading the video to the file system among other outcomes. The
	dictionaries must include the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses should re-define the _real_initialize() and _real_extract()
	methods, define a _VALID_URL regexp, and probably be added to the list
	of extractors.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader this extractor reports to.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def suitable(self, url):
		"""Return True if this extractor can handle the given URL."""
		return re.match(self._VALID_URL, url) is not None

	def initialize(self):
		"""Initialize the instance (authentication, etc); idempotent."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information and return it in a list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Set the downloader used by this extractor."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
1170
1171
1172 class YoutubeIE(InfoExtractor):
1173         """Information extractor for youtube.com."""
1174
	# Matches youtu.be short links, youtube.com / youtube-nocookie.com watch,
	# embed, e and v URLs, as well as bare video IDs; the ID is captured in
	# the last group.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# URL that switches the site to the English interface, so that the
	# scraping regexps below match regardless of the user's locale.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
	# Same, but free (WebM) formats are preferred over MP4 at equal quality.
	_available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
	# Maps format code -> container/file extension (default elsewhere: flv).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'44': 'webm',
		'45': 'webm',
	}
	# Maps format code -> video dimensions (height x width); '???' if unknown.
	_video_dimensions = {
		'5': '240x400',
		'6': '???',
		'13': '???',
		'17': '144x176',
		'18': '360x640',
		'22': '720x1280',
		'34': '360x640',
		'35': '480x854',
		'37': '1080x1920',
		'38': '3072x4096',
		'43': '360x640',
		'44': '480x854',
		'45': '720x1280',
	}
	IE_NAME = u'youtube'
1210
1211         def report_lang(self):
1212                 """Report attempt to set language."""
1213                 self._downloader.to_screen(u'[youtube] Setting language')
1214
1215         def report_login(self):
1216                 """Report attempt to log in."""
1217                 self._downloader.to_screen(u'[youtube] Logging in')
1218
1219         def report_age_confirmation(self):
1220                 """Report attempt to confirm age."""
1221                 self._downloader.to_screen(u'[youtube] Confirming age')
1222
1223         def report_video_webpage_download(self, video_id):
1224                 """Report attempt to download video webpage."""
1225                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1226
1227         def report_video_info_webpage_download(self, video_id):
1228                 """Report attempt to download video info webpage."""
1229                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1230
	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
1234
1235         def report_information_extraction(self, video_id):
1236                 """Report attempt to extract video information."""
1237                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1238
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1242
1243         def report_rtmp_download(self):
1244                 """Indicate the download will use the RTMP protocol."""
1245                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1246
1247         def _closed_captions_xml_to_srt(self, xml_string):
1248                 srt = ''
1249                 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250                 # TODO parse xml instead of regex
1251                 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1252                         if not dur: dur = '4'
1253                         start = float(start)
1254                         end = start + float(dur)
1255                         start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256                         end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258                         caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1259                         srt += str(n) + '\n'
1260                         srt += start + ' --> ' + end + '\n'
1261                         srt += caption + '\n\n'
1262                 return srt
1263
1264         def _print_formats(self, formats):
1265                 print 'Available formats:'
1266                 for x in formats:
1267                         print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1268
	def _real_initialize(self):
		"""Prepare the YouTube session: set the site language and, when
		credentials are available, log in and confirm age.

		Every failure is reported through the downloader and aborts the
		initialization early; nothing is raised to the caller.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present the credentials were rejected
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
1337
1338         def _real_extract(self, url):
1339                 # Extract video id from URL
1340                 mobj = re.match(self._VALID_URL, url)
1341                 if mobj is None:
1342                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1343                         return
1344                 video_id = mobj.group(2)
1345
1346                 # Get video webpage
1347                 self.report_video_webpage_download(video_id)
1348                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1349                 try:
1350                         video_webpage = urllib2.urlopen(request).read()
1351                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1352                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1353                         return
1354
1355                 # Attempt to extract SWF player URL
1356                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1357                 if mobj is not None:
1358                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1359                 else:
1360                         player_url = None
1361
1362                 # Get video info
1363                 self.report_video_info_webpage_download(video_id)
1364                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1365                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1366                                         % (video_id, el_type))
1367                         request = urllib2.Request(video_info_url)
1368                         try:
1369                                 video_info_webpage = urllib2.urlopen(request).read()
1370                                 video_info = parse_qs(video_info_webpage)
1371                                 if 'token' in video_info:
1372                                         break
1373                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1374                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1375                                 return
1376                 if 'token' not in video_info:
1377                         if 'reason' in video_info:
1378                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1379                         else:
1380                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1381                         return
1382
1383                 # Start extracting information
1384                 self.report_information_extraction(video_id)
1385
1386                 # uploader
1387                 if 'author' not in video_info:
1388                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1389                         return
1390                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1391
1392                 # title
1393                 if 'title' not in video_info:
1394                         self._downloader.trouble(u'ERROR: unable to extract video title')
1395                         return
1396                 video_title = urllib.unquote_plus(video_info['title'][0])
1397                 video_title = video_title.decode('utf-8')
1398                 video_title = sanitize_title(video_title)
1399
1400                 # simplified title
1401                 simple_title = _simplify_title(video_title)
1402
1403                 # thumbnail image
1404                 if 'thumbnail_url' not in video_info:
1405                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1406                         video_thumbnail = ''
1407                 else:   # don't panic if we can't find it
1408                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1409
1410                 # upload date
1411                 upload_date = u'NA'
1412                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1413                 if mobj is not None:
1414                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1415                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1416                         for expression in format_expressions:
1417                                 try:
1418                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1419                                 except:
1420                                         pass
1421
1422                 # description
1423                 try:
1424                         lxml.etree
1425                 except NameError:
1426                         video_description = u'No description available.'
1427                         mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1428                         if mobj is not None:
1429                                 video_description = mobj.group(1).decode('utf-8')
1430                 else:
1431                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1432                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1433                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1434                         # TODO use another parser
1435                         
1436                 # closed captions
1437                 video_subtitles = None
1438                 if self._downloader.params.get('writesubtitles', False):
1439                         self.report_video_subtitles_download(video_id)
1440                         request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1441                         try:
1442                                 srt_list = urllib2.urlopen(request).read()
1443                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444                                 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1445                         else:
1446                                 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1447                                 if srt_lang_list:
1448                                         if self._downloader.params.get('subtitleslang', False):
1449                                                 srt_lang = self._downloader.params.get('subtitleslang')
1450                                         elif 'en' in srt_lang_list:
1451                                                 srt_lang = 'en'
1452                                         else:
1453                                                 srt_lang = srt_lang_list[0]
1454                                         if not srt_lang in srt_lang_list:
1455                                                 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1456                                         else:
1457                                                 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1458                                                 try:
1459                                                         srt_xml = urllib2.urlopen(request).read()
1460                                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1461                                                         self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1462                                                 else:
1463                                                         video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1464                                 else:
1465                                         self._downloader.trouble(u'WARNING: video has no closed captions')
1466
1467                 # token
1468                 video_token = urllib.unquote_plus(video_info['token'][0])
1469
1470                 # Decide which formats to download
1471                 req_format = self._downloader.params.get('format', None)
1472
1473                 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474                         self.report_rtmp_download()
1475                         video_url_list = [(None, video_info['conn'][0])]
1476                 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1477                         url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1478                         url_data = [parse_qs(uds) for uds in url_data_strs]
1479                         url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1480                         url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1481
1482                         format_limit = self._downloader.params.get('format_limit', None)
1483                         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1484                         if format_limit is not None and format_limit in available_formats:
1485                                 format_list = available_formats[available_formats.index(format_limit):]
1486                         else:
1487                                 format_list = available_formats
1488                         existing_formats = [x for x in format_list if x in url_map]
1489                         if len(existing_formats) == 0:
1490                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1491                                 return
1492                         if self._downloader.params.get('listformats', None):
1493                                 self._print_formats(existing_formats)
1494                                 return
1495                         if req_format is None or req_format == 'best':
1496                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1497                         elif req_format == 'worst':
1498                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1499                         elif req_format in ('-1', 'all'):
1500                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1501                         else:
1502                                 # Specific formats. We pick the first in a slash-delimeted sequence.
1503                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1504                                 req_formats = req_format.split('/')
1505                                 video_url_list = None
1506                                 for rf in req_formats:
1507                                         if rf in url_map:
1508                                                 video_url_list = [(rf, url_map[rf])]
1509                                                 break
1510                                 if video_url_list is None:
1511                                         self._downloader.trouble(u'ERROR: requested format not available')
1512                                         return
1513                 else:
1514                         self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1515                         return
1516
1517                 for format_param, video_real_url in video_url_list:
1518                         # At this point we have a new video
1519                         self._downloader.increment_downloads()
1520
1521                         # Extension
1522                         video_extension = self._video_extensions.get(format_param, 'flv')
1523
1524                         try:
1525                                 # Process video information
1526                                 self._downloader.process_info({
1527                                         'id':           video_id.decode('utf-8'),
1528                                         'url':          video_real_url.decode('utf-8'),
1529                                         'uploader':     video_uploader.decode('utf-8'),
1530                                         'upload_date':  upload_date,
1531                                         'title':        video_title,
1532                                         'stitle':       simple_title,
1533                                         'ext':          video_extension.decode('utf-8'),
1534                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1535                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1536                                         'description':  video_description,
1537                                         'player_url':   player_url,
1538                                         'subtitles':    video_subtitles
1539                                 })
1540                         except UnavailableVideoError, err:
1541                                 self._downloader.trouble(u'\nERROR: unable to download video')
1542
1543
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# Delegate extractor used when a Metacafe id is a wrapped YouTube video
	_youtube_ie = None
	IE_NAME = u'metacafe'

	def __init__(self, youtube_ie, downloader=None):
		"""youtube_ie: extractor to delegate to for "yt-" prefixed ids."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the form that disables the
		family filter, so age-restricted videos are reachable."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the video information from a metacafe.com watch URL.

		Errors are reported via self._downloader.trouble() and abort the
		extraction; nothing is raised to the caller.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Delegate the whole extraction to the YouTube extractor
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fall back to the JSON-ish "mediaData" blob inside flashvars
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':           video_id.decode('utf-8'),
				'url':          video_url.decode('utf-8'),
				'uploader':     video_uploader.decode('utf-8'),
				'upload_date':  u'NA',
				'title':        video_title,
				'stitle':       simple_title,
				'ext':          video_extension.decode('utf-8'),
				'format':       u'NA',
				'player_url':   None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1684
1685
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
	IE_NAME = u'dailymotion'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_extract(self, url):
		"""Extract the video information from a Dailymotion video page.

		Errors are reported via self._downloader.trouble() and abort the
		extraction; nothing is raised to the caller.
		"""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		# The cookie disables Dailymotion's family filter so that
		# age-restricted videos remain reachable
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':           video_id.decode('utf-8'),
				'url':          video_url.decode('utf-8'),
				'uploader':     video_uploader.decode('utf-8'),
				'upload_date':  u'NA',
				'title':        video_title,
				'stitle':       simple_title,
				'ext':          video_extension.decode('utf-8'),
				'format':       u'NA',
				'player_url':   None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1772
1773
1774 class GoogleIE(InfoExtractor):
1775         """Information extractor for video.google.com."""
1776
1777         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1778         IE_NAME = u'video.google'
1779
	def __init__(self, downloader=None):
		"""Store the downloader on the InfoExtractor base class."""
		InfoExtractor.__init__(self, downloader)
1782
1783         def report_download_webpage(self, video_id):
1784                 """Report webpage download."""
1785                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1786
1787         def report_extraction(self, video_id):
1788                 """Report information extraction."""
1789                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1790
1791         def _real_extract(self, url):
1792                 # Extract id from URL
1793                 mobj = re.match(self._VALID_URL, url)
1794                 if mobj is None:
1795                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1796                         return
1797
1798                 # At this point we have a new video
1799                 self._downloader.increment_downloads()
1800                 video_id = mobj.group(1)
1801
1802                 video_extension = 'mp4'
1803
1804                 # Retrieve video webpage to extract further information
1805                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1806                 try:
1807                         self.report_download_webpage(video_id)
1808                         webpage = urllib2.urlopen(request).read()
1809                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1811                         return
1812
1813                 # Extract URL, uploader, and title from webpage
1814                 self.report_extraction(video_id)
1815                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1816                 if mobj is None:
1817                         video_extension = 'flv'
1818                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1819                 if mobj is None:
1820                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1821                         return
1822                 mediaURL = urllib.unquote(mobj.group(1))
1823                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1824                 mediaURL = mediaURL.replace('\\x26', '\x26')
1825
1826                 video_url = mediaURL
1827
1828                 mobj = re.search(r'<title>(.*)</title>', webpage)
1829                 if mobj is None:
1830                         self._downloader.trouble(u'ERROR: unable to extract title')
1831                         return
1832                 video_title = mobj.group(1).decode('utf-8')
1833                 video_title = sanitize_title(video_title)
1834                 simple_title = _simplify_title(video_title)
1835
1836                 # Extract video description
1837                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1838                 if mobj is None:
1839                         self._downloader.trouble(u'ERROR: unable to extract video description')
1840                         return
1841                 video_description = mobj.group(1).decode('utf-8')
1842                 if not video_description:
1843                         video_description = 'No description available.'
1844
1845                 # Extract video thumbnail
1846                 if self._downloader.params.get('forcethumbnail', False):
1847                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1848                         try:
1849                                 webpage = urllib2.urlopen(request).read()
1850                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1852                                 return
1853                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1854                         if mobj is None:
1855                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1856                                 return
1857                         video_thumbnail = mobj.group(1)
1858                 else:   # we need something to pass to process_info
1859                         video_thumbnail = ''
1860
1861                 try:
1862                         # Process video information
1863                         self._downloader.process_info({
1864                                 'id':           video_id.decode('utf-8'),
1865                                 'url':          video_url.decode('utf-8'),
1866                                 'uploader':     u'NA',
1867                                 'upload_date':  u'NA',
1868                                 'title':        video_title,
1869                                 'stitle':       simple_title,
1870                                 'ext':          video_extension.decode('utf-8'),
1871                                 'format':       u'NA',
1872                                 'player_url':   None,
1873                         })
1874                 except UnavailableVideoError:
1875                         self._downloader.trouble(u'\nERROR: unable to download video')
1876
1877
1878 class PhotobucketIE(InfoExtractor):
1879         """Information extractor for photobucket.com."""
1880
1881         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1882         IE_NAME = u'photobucket'
1883
1884         def __init__(self, downloader=None):
1885                 InfoExtractor.__init__(self, downloader)
1886
1887         def report_download_webpage(self, video_id):
1888                 """Report webpage download."""
1889                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1890
1891         def report_extraction(self, video_id):
1892                 """Report information extraction."""
1893                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1894
1895         def _real_extract(self, url):
1896                 # Extract id from URL
1897                 mobj = re.match(self._VALID_URL, url)
1898                 if mobj is None:
1899                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1900                         return
1901
1902                 # At this point we have a new video
1903                 self._downloader.increment_downloads()
1904                 video_id = mobj.group(1)
1905
1906                 video_extension = 'flv'
1907
1908                 # Retrieve video webpage to extract further information
1909                 request = urllib2.Request(url)
1910                 try:
1911                         self.report_download_webpage(video_id)
1912                         webpage = urllib2.urlopen(request).read()
1913                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1914                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1915                         return
1916
1917                 # Extract URL, uploader, and title from webpage
1918                 self.report_extraction(video_id)
1919                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1920                 if mobj is None:
1921                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1922                         return
1923                 mediaURL = urllib.unquote(mobj.group(1))
1924
1925                 video_url = mediaURL
1926
1927                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1928                 if mobj is None:
1929                         self._downloader.trouble(u'ERROR: unable to extract title')
1930                         return
1931                 video_title = mobj.group(1).decode('utf-8')
1932                 video_title = sanitize_title(video_title)
1933                 simple_title = _simplify_title(vide_title)
1934
1935                 video_uploader = mobj.group(2).decode('utf-8')
1936
1937                 try:
1938                         # Process video information
1939                         self._downloader.process_info({
1940                                 'id':           video_id.decode('utf-8'),
1941                                 'url':          video_url.decode('utf-8'),
1942                                 'uploader':     video_uploader,
1943                                 'upload_date':  u'NA',
1944                                 'title':        video_title,
1945                                 'stitle':       simple_title,
1946                                 'ext':          video_extension.decode('utf-8'),
1947                                 'format':       u'NA',
1948                                 'player_url':   None,
1949                         })
1950                 except UnavailableVideoError:
1951                         self._downloader.trouble(u'\nERROR: unable to download video')
1952
1953
1954 class YahooIE(InfoExtractor):
1955         """Information extractor for video.yahoo.com."""
1956
1957         # _VALID_URL matches all Yahoo! Video URLs
1958         # _VPAGE_URL matches only the extractable '/watch/' URLs
1959         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1960         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1961         IE_NAME = u'video.yahoo'
1962
1963         def __init__(self, downloader=None):
1964                 InfoExtractor.__init__(self, downloader)
1965
1966         def report_download_webpage(self, video_id):
1967                 """Report webpage download."""
1968                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1969
1970         def report_extraction(self, video_id):
1971                 """Report information extraction."""
1972                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1973
1974         def _real_extract(self, url, new_video=True):
1975                 # Extract ID from URL
1976                 mobj = re.match(self._VALID_URL, url)
1977                 if mobj is None:
1978                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1979                         return
1980
1981                 # At this point we have a new video
1982                 self._downloader.increment_downloads()
1983                 video_id = mobj.group(2)
1984                 video_extension = 'flv'
1985
1986                 # Rewrite valid but non-extractable URLs as
1987                 # extractable English language /watch/ URLs
1988                 if re.match(self._VPAGE_URL, url) is None:
1989                         request = urllib2.Request(url)
1990                         try:
1991                                 webpage = urllib2.urlopen(request).read()
1992                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1994                                 return
1995
1996                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1997                         if mobj is None:
1998                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1999                                 return
2000                         yahoo_id = mobj.group(1)
2001
2002                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
2003                         if mobj is None:
2004                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2005                                 return
2006                         yahoo_vid = mobj.group(1)
2007
2008                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2009                         return self._real_extract(url, new_video=False)
2010
2011                 # Retrieve video webpage to extract further information
2012                 request = urllib2.Request(url)
2013                 try:
2014                         self.report_download_webpage(video_id)
2015                         webpage = urllib2.urlopen(request).read()
2016                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2018                         return
2019
2020                 # Extract uploader and title from webpage
2021                 self.report_extraction(video_id)
2022                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2023                 if mobj is None:
2024                         self._downloader.trouble(u'ERROR: unable to extract video title')
2025                         return
2026                 video_title = mobj.group(1).decode('utf-8')
2027                 simple_title = _simplify_title(video_title)
2028
2029                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2030                 if mobj is None:
2031                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
2032                         return
2033                 video_uploader = mobj.group(1).decode('utf-8')
2034
2035                 # Extract video thumbnail
2036                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2037                 if mobj is None:
2038                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2039                         return
2040                 video_thumbnail = mobj.group(1).decode('utf-8')
2041
2042                 # Extract video description
2043                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2044                 if mobj is None:
2045                         self._downloader.trouble(u'ERROR: unable to extract video description')
2046                         return
2047                 video_description = mobj.group(1).decode('utf-8')
2048                 if not video_description:
2049                         video_description = 'No description available.'
2050
2051                 # Extract video height and width
2052                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2053                 if mobj is None:
2054                         self._downloader.trouble(u'ERROR: unable to extract video height')
2055                         return
2056                 yv_video_height = mobj.group(1)
2057
2058                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2059                 if mobj is None:
2060                         self._downloader.trouble(u'ERROR: unable to extract video width')
2061                         return
2062                 yv_video_width = mobj.group(1)
2063
2064                 # Retrieve video playlist to extract media URL
2065                 # I'm not completely sure what all these options are, but we
2066                 # seem to need most of them, otherwise the server sends a 401.
2067                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
2068                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
2069                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2070                                 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2071                                 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2072                 try:
2073                         self.report_download_webpage(video_id)
2074                         webpage = urllib2.urlopen(request).read()
2075                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2077                         return
2078
2079                 # Extract media URL from playlist XML
2080                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2081                 if mobj is None:
2082                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
2083                         return
2084                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2085                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2086
2087                 try:
2088                         # Process video information
2089                         self._downloader.process_info({
2090                                 'id':           video_id.decode('utf-8'),
2091                                 'url':          video_url,
2092                                 'uploader':     video_uploader,
2093                                 'upload_date':  u'NA',
2094                                 'title':        video_title,
2095                                 'stitle':       simple_title,
2096                                 'ext':          video_extension.decode('utf-8'),
2097                                 'thumbnail':    video_thumbnail.decode('utf-8'),
2098                                 'description':  video_description,
2099                                 'thumbnail':    video_thumbnail,
2100                                 'player_url':   None,
2101                         })
2102                 except UnavailableVideoError:
2103                         self._downloader.trouble(u'\nERROR: unable to download video')
2104
2105
2106 class VimeoIE(InfoExtractor):
2107         """Information extractor for vimeo.com."""
2108
2109         # _VALID_URL matches Vimeo URLs
2110         _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2111         IE_NAME = u'vimeo'
2112
2113         def __init__(self, downloader=None):
2114                 InfoExtractor.__init__(self, downloader)
2115
2116         def report_download_webpage(self, video_id):
2117                 """Report webpage download."""
2118                 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2119
2120         def report_extraction(self, video_id):
2121                 """Report information extraction."""
2122                 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2123
2124         def _real_extract(self, url, new_video=True):
2125                 # Extract ID from URL
2126                 mobj = re.match(self._VALID_URL, url)
2127                 if mobj is None:
2128                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2129                         return
2130
2131                 # At this point we have a new video
2132                 self._downloader.increment_downloads()
2133                 video_id = mobj.group(1)
2134
2135                 # Retrieve video webpage to extract further information
2136                 request = urllib2.Request(url, None, std_headers)
2137                 try:
2138                         self.report_download_webpage(video_id)
2139                         webpage = urllib2.urlopen(request).read()
2140                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2142                         return
2143
2144                 # Now we begin extracting as much information as we can from what we
2145                 # retrieved. First we extract the information common to all extractors,
2146                 # and latter we extract those that are Vimeo specific.
2147                 self.report_extraction(video_id)
2148
2149                 # Extract the config JSON
2150                 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2151                 try:
2152                         config = json.loads(config)
2153                 except:
2154                         self._downloader.trouble(u'ERROR: unable to extract info section')
2155                         return
2156                 
2157                 # Extract title
2158                 video_title = config["video"]["title"]
2159                 simple_title = _simplify_title(video_title)
2160
2161                 # Extract uploader
2162                 video_uploader = config["video"]["owner"]["name"]
2163
2164                 # Extract video thumbnail
2165                 video_thumbnail = config["video"]["thumbnail"]
2166
2167                 # Extract video description
2168                 try:
2169                         lxml.etree
2170                 except NameError:
2171                         video_description = u'No description available.'
2172                         mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2173                         if mobj is not None:
2174                                 video_description = mobj.group(1)
2175                 else:
2176                         html_parser = lxml.etree.HTMLParser()
2177                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2178                         video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2179                         # TODO use another parser
2180
2181                 # Extract upload date
2182                 video_upload_date = u'NA'
2183                 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2184                 if mobj is not None:
2185                         video_upload_date = mobj.group(1)
2186
2187                 # Vimeo specific: extract request signature and timestamp
2188                 sig = config['request']['signature']
2189                 timestamp = config['request']['timestamp']
2190
2191                 # Vimeo specific: extract video codec and quality information
2192                 # TODO bind to format param
2193                 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2194                 for codec in codecs:
2195                         if codec[0] in config["video"]["files"]:
2196                                 video_codec = codec[0]
2197                                 video_extension = codec[1]
2198                                 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2199                                 else: quality = 'sd'
2200                                 break
2201                 else:
2202                         self._downloader.trouble(u'ERROR: no known codec found')
2203                         return
2204
2205                 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2206                                         %(video_id, sig, timestamp, quality, video_codec.upper())
2207
2208                 try:
2209                         # Process video information
2210                         self._downloader.process_info({
2211                                 'id':           video_id,
2212                                 'url':          video_url,
2213                                 'uploader':     video_uploader,
2214                                 'upload_date':  video_upload_date,
2215                                 'title':        video_title,
2216                                 'stitle':       simple_title,
2217                                 'ext':          video_extension,
2218                                 'thumbnail':    video_thumbnail,
2219                                 'description':  video_description,
2220                                 'player_url':   None,
2221                         })
2222                 except UnavailableVideoError:
2223                         self._downloader.trouble(u'ERROR: unable to download video')
2224
2225
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor."""

	# Matches anything: this IE must be registered last so every other
	# extractor gets a chance first.
	_VALID_URL = r'.*'
	IE_NAME = u'generic'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def report_following_redirect(self, new_url):
		"""Report that a redirect to new_url is being followed."""
		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

	def _test_redirect(self, url):
		"""Check if it is a redirect, like url shorteners, in case restart chain.

		Probes `url` with a HEAD request; if the final URL differs, the new
		URL is re-submitted to the downloader (so a more specific extractor
		can claim it) and True is returned.  Returns False when no redirect
		happened.
		"""
		# urllib2.Request issues GET by default; override get_method so the
		# probe transfers no body.
		class HeadRequest(urllib2.Request):
			def get_method(self):
				return "HEAD"

		class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
			"""
			Subclass the HTTPRedirectHandler to make it use our 
			HeadRequest also on the redirected URL
			"""
			def redirect_request(self, req, fp, code, msg, headers, newurl): 
				if code in (301, 302, 303, 307):
				    # Spaces are legal in Location headers in the wild;
				    # percent-encode them before re-requesting.
				    newurl = newurl.replace(' ', '%20') 
				    # Drop body-describing headers: the follow-up HEAD
				    # request carries no body.
				    newheaders = dict((k,v) for k,v in req.headers.items()
						      if k.lower() not in ("content-length", "content-type"))
				    return HeadRequest(newurl, 
						       headers=newheaders,
						       origin_req_host=req.get_origin_req_host(), 
						       unverifiable=True) 
				else: 
				    # Any other 3xx is treated as an error, mirroring the
				    # default handler's behavior.
				    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 
				    
		class HTTPMethodFallback(urllib2.BaseHandler):
			"""
			Fallback to GET if HEAD is not allowed (405 HTTP error)
			"""
			def http_error_405(self, req, fp, code, msg, headers): 
				# Drain and close the 405 response before retrying.
				fp.read()
				fp.close()

				newheaders = dict((k,v) for k,v in req.headers.items()
						  if k.lower() not in ("content-length", "content-type"))
				# Re-issue as a plain (GET) request through the same opener.
				return self.parent.open(urllib2.Request(req.get_full_url(), 
								 headers=newheaders, 
								 origin_req_host=req.get_origin_req_host(), 
								 unverifiable=True))

		# Build our opener
		# NOTE: a bare OpenerDirector is used (not build_opener) so the stock
		# redirect handler is absent and our HEADRedirectHandler is the only
		# one consulted.
		opener = urllib2.OpenerDirector() 
		for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
					HTTPMethodFallback, HEADRedirectHandler,
					urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
			opener.add_handler(handler())

		response = opener.open(HeadRequest(url))
		# geturl() returns the URL after all redirects were followed.
		new_url = response.geturl()
		
		if url == new_url: return False
		
		# Restart the extraction chain on the resolved URL.
		self.report_following_redirect(new_url)
		self._downloader.download([new_url])
		return True

	def _real_extract(self, url):
		# URL shorteners and the like are resolved first; if a redirect was
		# followed, the new URL was re-queued and we are done here.
		if self._test_redirect(url): return
		
		# At this point we have a new video
		self._downloader.increment_downloads()

		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		if mobj is None:
			# Broaden the search a little bit
			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		video_url = urllib.unquote(mobj.group(1))
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		# (strip the extension off the basename to get the id)
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = _simplify_title(video_title)

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_uploader = mobj.group(1).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
2381
2382
2383 class YoutubeSearchIE(InfoExtractor):
2384         """Information Extractor for YouTube search queries."""
2385         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2386         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2387         _youtube_ie = None
2388         _max_youtube_results = 1000
2389         IE_NAME = u'youtube:search'
2390
2391         def __init__(self, youtube_ie, downloader=None):
2392                 InfoExtractor.__init__(self, downloader)
2393                 self._youtube_ie = youtube_ie
2394
2395         def report_download_page(self, query, pagenum):
2396                 """Report attempt to download playlist page with given number."""
2397                 query = query.decode(preferredencoding())
2398                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2399
2400         def _real_initialize(self):
2401                 self._youtube_ie.initialize()
2402
2403         def _real_extract(self, query):
2404                 mobj = re.match(self._VALID_URL, query)
2405                 if mobj is None:
2406                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2407                         return
2408
2409                 prefix, query = query.split(':')
2410                 prefix = prefix[8:]
2411                 query = query.encode('utf-8')
2412                 if prefix == '':
2413                         self._download_n_results(query, 1)
2414                         return
2415                 elif prefix == 'all':
2416                         self._download_n_results(query, self._max_youtube_results)
2417                         return
2418                 else:
2419                         try:
2420                                 n = long(prefix)
2421                                 if n <= 0:
2422                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2423                                         return
2424                                 elif n > self._max_youtube_results:
2425                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2426                                         n = self._max_youtube_results
2427                                 self._download_n_results(query, n)
2428                                 return
2429                         except ValueError: # parsing prefix as integer fails
2430                                 self._download_n_results(query, 1)
2431                                 return
2432
2433         def _download_n_results(self, query, n):
2434                 """Downloads a specified number of results for a query"""
2435
2436                 video_ids = []
2437                 pagenum = 0
2438                 limit = n
2439
2440                 while (50 * pagenum) < limit:
2441                         self.report_download_page(query, pagenum+1)
2442                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2443                         request = urllib2.Request(result_url)
2444                         try:
2445                                 data = urllib2.urlopen(request).read()
2446                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2447                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2448                                 return
2449                         api_response = json.loads(data)['data']
2450
2451                         new_ids = list(video['id'] for video in api_response['items'])
2452                         video_ids += new_ids
2453
2454                         limit = min(n, api_response['totalItems'])
2455                         pagenum += 1
2456
2457                 if len(video_ids) > n:
2458                         video_ids = video_ids[:n]
2459                 for id in video_ids:
2460                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2461                 return
2462
2463
2464 class GoogleSearchIE(InfoExtractor):
2465         """Information Extractor for Google Video search queries."""
2466         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2467         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2468         _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
2469         _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2470         _google_ie = None
2471         _max_google_results = 1000
2472         IE_NAME = u'video.google:search'
2473
2474         def __init__(self, google_ie, downloader=None):
2475                 InfoExtractor.__init__(self, downloader)
2476                 self._google_ie = google_ie
2477
2478         def report_download_page(self, query, pagenum):
2479                 """Report attempt to download playlist page with given number."""
2480                 query = query.decode(preferredencoding())
2481                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2482
2483         def _real_initialize(self):
2484                 self._google_ie.initialize()
2485
2486         def _real_extract(self, query):
2487                 mobj = re.match(self._VALID_URL, query)
2488                 if mobj is None:
2489                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2490                         return
2491
2492                 prefix, query = query.split(':')
2493                 prefix = prefix[8:]
2494                 query = query.encode('utf-8')
2495                 if prefix == '':
2496                         self._download_n_results(query, 1)
2497                         return
2498                 elif prefix == 'all':
2499                         self._download_n_results(query, self._max_google_results)
2500                         return
2501                 else:
2502                         try:
2503                                 n = long(prefix)
2504                                 if n <= 0:
2505                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2506                                         return
2507                                 elif n > self._max_google_results:
2508                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2509                                         n = self._max_google_results
2510                                 self._download_n_results(query, n)
2511                                 return
2512                         except ValueError: # parsing prefix as integer fails
2513                                 self._download_n_results(query, 1)
2514                                 return
2515
2516         def _download_n_results(self, query, n):
2517                 """Downloads a specified number of results for a query"""
2518
2519                 video_ids = []
2520                 pagenum = 0
2521
2522                 while True:
2523                         self.report_download_page(query, pagenum)
2524                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2525                         request = urllib2.Request(result_url)
2526                         try:
2527                                 page = urllib2.urlopen(request).read()
2528                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2529                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2530                                 return
2531
2532                         # Extract video identifiers
2533                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2534                                 video_id = mobj.group(1)
2535                                 if video_id not in video_ids:
2536                                         video_ids.append(video_id)
2537                                         if len(video_ids) == n:
2538                                                 # Specified n videos reached
2539                                                 for id in video_ids:
2540                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2541                                                 return
2542
2543                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2544                                 for id in video_ids:
2545                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2546                                 return
2547
2548                         pagenum = pagenum + 1
2549
2550
2551 class YahooSearchIE(InfoExtractor):
2552         """Information Extractor for Yahoo! Video search queries."""
2553         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2554         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2555         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2556         _MORE_PAGES_INDICATOR = r'\s*Next'
2557         _yahoo_ie = None
2558         _max_yahoo_results = 1000
2559         IE_NAME = u'video.yahoo:search'
2560
2561         def __init__(self, yahoo_ie, downloader=None):
2562                 InfoExtractor.__init__(self, downloader)
2563                 self._yahoo_ie = yahoo_ie
2564
2565         def report_download_page(self, query, pagenum):
2566                 """Report attempt to download playlist page with given number."""
2567                 query = query.decode(preferredencoding())
2568                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2569
2570         def _real_initialize(self):
2571                 self._yahoo_ie.initialize()
2572
2573         def _real_extract(self, query):
2574                 mobj = re.match(self._VALID_URL, query)
2575                 if mobj is None:
2576                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2577                         return
2578
2579                 prefix, query = query.split(':')
2580                 prefix = prefix[8:]
2581                 query = query.encode('utf-8')
2582                 if prefix == '':
2583                         self._download_n_results(query, 1)
2584                         return
2585                 elif prefix == 'all':
2586                         self._download_n_results(query, self._max_yahoo_results)
2587                         return
2588                 else:
2589                         try:
2590                                 n = long(prefix)
2591                                 if n <= 0:
2592                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2593                                         return
2594                                 elif n > self._max_yahoo_results:
2595                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2596                                         n = self._max_yahoo_results
2597                                 self._download_n_results(query, n)
2598                                 return
2599                         except ValueError: # parsing prefix as integer fails
2600                                 self._download_n_results(query, 1)
2601                                 return
2602
2603         def _download_n_results(self, query, n):
2604                 """Downloads a specified number of results for a query"""
2605
2606                 video_ids = []
2607                 already_seen = set()
2608                 pagenum = 1
2609
2610                 while True:
2611                         self.report_download_page(query, pagenum)
2612                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2613                         request = urllib2.Request(result_url)
2614                         try:
2615                                 page = urllib2.urlopen(request).read()
2616                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2617                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2618                                 return
2619
2620                         # Extract video identifiers
2621                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2622                                 video_id = mobj.group(1)
2623                                 if video_id not in already_seen:
2624                                         video_ids.append(video_id)
2625                                         already_seen.add(video_id)
2626                                         if len(video_ids) == n:
2627                                                 # Specified n videos reached
2628                                                 for id in video_ids:
2629                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2630                                                 return
2631
2632                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2633                                 for id in video_ids:
2634                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2635                                 return
2636
2637                         pagenum = pagenum + 1
2638
2639
2640 class YoutubePlaylistIE(InfoExtractor):
2641         """Information Extractor for YouTube playlists."""
2642
2643         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2644         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2645         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=PL%s&'
2646         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2647         _youtube_ie = None
2648         IE_NAME = u'youtube:playlist'
2649
2650         def __init__(self, youtube_ie, downloader=None):
2651                 InfoExtractor.__init__(self, downloader)
2652                 self._youtube_ie = youtube_ie
2653
2654         def report_download_page(self, playlist_id, pagenum):
2655                 """Report attempt to download playlist page with given number."""
2656                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2657
2658         def _real_initialize(self):
2659                 self._youtube_ie.initialize()
2660
2661         def _real_extract(self, url):
2662                 # Extract playlist id
2663                 mobj = re.match(self._VALID_URL, url)
2664                 if mobj is None:
2665                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2666                         return
2667
2668                 # Single video case
2669                 if mobj.group(3) is not None:
2670                         self._youtube_ie.extract(mobj.group(3))
2671                         return
2672
2673                 # Download playlist pages
2674                 # prefix is 'p' as default for playlists but there are other types that need extra care
2675                 playlist_prefix = mobj.group(1)
2676                 if playlist_prefix == 'a':
2677                         playlist_access = 'artist'
2678                 else:
2679                         playlist_prefix = 'p'
2680                         playlist_access = 'view_play_list'
2681                 playlist_id = mobj.group(2)
2682                 video_ids = []
2683                 pagenum = 1
2684
2685                 while True:
2686                         self.report_download_page(playlist_id, pagenum)
2687                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2688                         request = urllib2.Request(url)
2689                         try:
2690                                 page = urllib2.urlopen(request).read()
2691                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2692                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2693                                 return
2694
2695                         # Extract video identifiers
2696                         ids_in_page = []
2697                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2698                                 if mobj.group(1) not in ids_in_page:
2699                                         ids_in_page.append(mobj.group(1))
2700                         video_ids.extend(ids_in_page)
2701
2702                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2703                                 break
2704                         pagenum = pagenum + 1
2705
2706                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2707                 playlistend = self._downloader.params.get('playlistend', -1)
2708                 if playlistend == -1:
2709                         video_ids = video_ids[playliststart:]
2710                 else:
2711                         video_ids = video_ids[playliststart:playlistend]
2712
2713                 for id in video_ids:
2714                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2715                 return
2716
2717
2718 class YoutubeUserIE(InfoExtractor):
2719         """Information Extractor for YouTube users."""
2720
2721         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2722         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2723         _GDATA_PAGE_SIZE = 50
2724         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2725         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2726         _youtube_ie = None
2727         IE_NAME = u'youtube:user'
2728
2729         def __init__(self, youtube_ie, downloader=None):
2730                 InfoExtractor.__init__(self, downloader)
2731                 self._youtube_ie = youtube_ie
2732
2733         def report_download_page(self, username, start_index):
2734                 """Report attempt to download user page."""
2735                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2736                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2737
2738         def _real_initialize(self):
2739                 self._youtube_ie.initialize()
2740
2741         def _real_extract(self, url):
2742                 # Extract username
2743                 mobj = re.match(self._VALID_URL, url)
2744                 if mobj is None:
2745                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2746                         return
2747
2748                 username = mobj.group(1)
2749
2750                 # Download video ids using YouTube Data API. Result size per
2751                 # query is limited (currently to 50 videos) so we need to query
2752                 # page by page until there are no video ids - it means we got
2753                 # all of them.
2754
2755                 video_ids = []
2756                 pagenum = 0
2757
2758                 while True:
2759                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2760                         self.report_download_page(username, start_index)
2761
2762                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2763
2764                         try:
2765                                 page = urllib2.urlopen(request).read()
2766                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2767                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2768                                 return
2769
2770                         # Extract video identifiers
2771                         ids_in_page = []
2772
2773                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2774                                 if mobj.group(1) not in ids_in_page:
2775                                         ids_in_page.append(mobj.group(1))
2776
2777                         video_ids.extend(ids_in_page)
2778
2779                         # A little optimization - if current page is not
2780                         # "full", ie. does not contain PAGE_SIZE video ids then
2781                         # we can assume that this page is the last one - there
2782                         # are no more ids on further pages - no need to query
2783                         # again.
2784
2785                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2786                                 break
2787
2788                         pagenum += 1
2789
2790                 all_ids_count = len(video_ids)
2791                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2792                 playlistend = self._downloader.params.get('playlistend', -1)
2793
2794                 if playlistend == -1:
2795                         video_ids = video_ids[playliststart:]
2796                 else:
2797                         video_ids = video_ids[playliststart:playlistend]
2798
2799                 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2800                                 (username, all_ids_count, len(video_ids)))
2801
2802                 for video_id in video_ids:
2803                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2804
2805
2806 class DepositFilesIE(InfoExtractor):
2807         """Information extractor for depositfiles.com"""
2808
2809         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2810         IE_NAME = u'DepositFiles'
2811
2812         def __init__(self, downloader=None):
2813                 InfoExtractor.__init__(self, downloader)
2814
2815         def report_download_webpage(self, file_id):
2816                 """Report webpage download."""
2817                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2818
2819         def report_extraction(self, file_id):
2820                 """Report information extraction."""
2821                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2822
2823         def _real_extract(self, url):
2824                 # At this point we have a new file
2825                 self._downloader.increment_downloads()
2826
2827                 file_id = url.split('/')[-1]
2828                 # Rebuild url in english locale
2829                 url = 'http://depositfiles.com/en/files/' + file_id
2830
2831                 # Retrieve file webpage with 'Free download' button pressed
2832                 free_download_indication = { 'gateway_result' : '1' }
2833                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2834                 try:
2835                         self.report_download_webpage(file_id)
2836                         webpage = urllib2.urlopen(request).read()
2837                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2838                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2839                         return
2840
2841                 # Search for the real file URL
2842                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2843                 if (mobj is None) or (mobj.group(1) is None):
2844                         # Try to figure out reason of the error.
2845                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2846                         if (mobj is not None) and (mobj.group(1) is not None):
2847                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2848                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2849                         else:
2850                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2851                         return
2852
2853                 file_url = mobj.group(1)
2854                 file_extension = os.path.splitext(file_url)[1][1:]
2855
2856                 # Search for file title
2857                 mobj = re.search(r'<b title="(.*?)">', webpage)
2858                 if mobj is None:
2859                         self._downloader.trouble(u'ERROR: unable to extract title')
2860                         return
2861                 file_title = mobj.group(1).decode('utf-8')
2862
2863                 try:
2864                         # Process file information
2865                         self._downloader.process_info({
2866                                 'id':           file_id.decode('utf-8'),
2867                                 'url':          file_url.decode('utf-8'),
2868                                 'uploader':     u'NA',
2869                                 'upload_date':  u'NA',
2870                                 'title':        file_title,
2871                                 'stitle':       file_title,
2872                                 'ext':          file_extension.decode('utf-8'),
2873                                 'format':       u'NA',
2874                                 'player_url':   None,
2875                         })
2876                 except UnavailableVideoError, err:
2877                         self._downloader.trouble(u'ERROR: unable to download file')
2878
2879
2880 class FacebookIE(InfoExtractor):
2881         """Information Extractor for Facebook"""
2882
2883         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2884         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2885         _NETRC_MACHINE = 'facebook'
2886         _available_formats = ['video', 'highqual', 'lowqual']
2887         _video_extensions = {
2888                 'video': 'mp4',
2889                 'highqual': 'mp4',
2890                 'lowqual': 'mp4',
2891         }
2892         IE_NAME = u'facebook'
2893
	def __init__(self, downloader=None):
		"""Initialize the base InfoExtractor with the given downloader."""
		InfoExtractor.__init__(self, downloader)
2896
	def _reporter(self, message):
		"""Prepend the '[facebook]' header to message and print it to the screen."""
		self._downloader.to_screen(u'[facebook] %s' % message)
2900
	def report_login(self):
		"""Report an attempt to log in to Facebook."""
		self._reporter(u'Logging in')
2904
	def report_video_webpage_download(self, video_id):
		"""Report an attempt to download the webpage of the given video."""
		self._reporter(u'%s: Downloading video webpage' % video_id)
2908
	def report_information_extraction(self, video_id):
		"""Report an attempt to extract information for the given video."""
		self._reporter(u'%s: Extracting video information' % video_id)
2912
2913         def _parse_page(self, video_webpage):
2914                 """Extract video information from page"""
2915                 # General data
2916                 data = {'title': r'\("video_title", "(.*?)"\)',
2917                         'description': r'<div class="datawrap">(.*?)</div>',
2918                         'owner': r'\("video_owner_name", "(.*?)"\)',
2919                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2920                         }
2921                 video_info = {}
2922                 for piece in data.keys():
2923                         mobj = re.search(data[piece], video_webpage)
2924                         if mobj is not None:
2925                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2926
2927                 # Video urls
2928                 video_urls = {}
2929                 for fmt in self._available_formats:
2930                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2931                         if mobj is not None:
2932                                 # URL is in a Javascript segment inside an escaped Unicode format within
2933                                 # the generally utf-8 page
2934                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2935                 video_info['video_urls'] = video_urls
2936
2937                 return video_info
2938
2939         def _real_initialize(self):
2940                 if self._downloader is None:
2941                         return
2942
2943                 useremail = None
2944                 password = None
2945                 downloader_params = self._downloader.params
2946
2947                 # Attempt to use provided username and password or .netrc data
2948                 if downloader_params.get('username', None) is not None:
2949                         useremail = downloader_params['username']
2950                         password = downloader_params['password']
2951                 elif downloader_params.get('usenetrc', False):
2952                         try:
2953                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2954                                 if info is not None:
2955                                         useremail = info[0]
2956                                         password = info[2]
2957                                 else:
2958                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2959                         except (IOError, netrc.NetrcParseError), err:
2960                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2961                                 return
2962
2963                 if useremail is None:
2964                         return
2965
2966                 # Log in
2967                 login_form = {
2968                         'email': useremail,
2969                         'pass': password,
2970                         'login': 'Log+In'
2971                         }
2972                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2973                 try:
2974                         self.report_login()
2975                         login_results = urllib2.urlopen(request).read()
2976                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2977                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2978                                 return
2979                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2980                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2981                         return
2982
2983         def _real_extract(self, url):
2984                 mobj = re.match(self._VALID_URL, url)
2985                 if mobj is None:
2986                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2987                         return
2988                 video_id = mobj.group('ID')
2989
2990                 # Get video webpage
2991                 self.report_video_webpage_download(video_id)
2992                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2993                 try:
2994                         page = urllib2.urlopen(request)
2995                         video_webpage = page.read()
2996                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2997                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2998                         return
2999
3000                 # Start extracting information
3001                 self.report_information_extraction(video_id)
3002
3003                 # Extract information
3004                 video_info = self._parse_page(video_webpage)
3005
3006                 # uploader
3007                 if 'owner' not in video_info:
3008                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
3009                         return
3010                 video_uploader = video_info['owner']
3011
3012                 # title
3013                 if 'title' not in video_info:
3014                         self._downloader.trouble(u'ERROR: unable to extract video title')
3015                         return
3016                 video_title = video_info['title']
3017                 video_title = video_title.decode('utf-8')
3018                 video_title = sanitize_title(video_title)
3019
3020                 simple_title = _simplify_title(video_title)
3021
3022                 # thumbnail image
3023                 if 'thumbnail' not in video_info:
3024                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
3025                         video_thumbnail = ''
3026                 else:
3027                         video_thumbnail = video_info['thumbnail']
3028
3029                 # upload date
3030                 upload_date = u'NA'
3031                 if 'upload_date' in video_info:
3032                         upload_time = video_info['upload_date']
3033                         timetuple = email.utils.parsedate_tz(upload_time)
3034                         if timetuple is not None:
3035                                 try:
3036                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
3037                                 except:
3038                                         pass
3039
3040                 # description
3041                 video_description = video_info.get('description', 'No description available.')
3042
3043                 url_map = video_info['video_urls']
3044                 if len(url_map.keys()) > 0:
3045                         # Decide which formats to download
3046                         req_format = self._downloader.params.get('format', None)
3047                         format_limit = self._downloader.params.get('format_limit', None)
3048
3049                         if format_limit is not None and format_limit in self._available_formats:
3050                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
3051                         else:
3052                                 format_list = self._available_formats
3053                         existing_formats = [x for x in format_list if x in url_map]
3054                         if len(existing_formats) == 0:
3055                                 self._downloader.trouble(u'ERROR: no known formats available for video')
3056                                 return
3057                         if req_format is None:
3058                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
3059                         elif req_format == 'worst':
3060                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3061                         elif req_format == '-1':
3062                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3063                         else:
3064                                 # Specific format
3065                                 if req_format not in url_map:
3066                                         self._downloader.trouble(u'ERROR: requested format not available')
3067                                         return
3068                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
3069
3070                 for format_param, video_real_url in video_url_list:
3071
3072                         # At this point we have a new video
3073                         self._downloader.increment_downloads()
3074
3075                         # Extension
3076                         video_extension = self._video_extensions.get(format_param, 'mp4')
3077
3078                         try:
3079                                 # Process video information
3080                                 self._downloader.process_info({
3081                                         'id':           video_id.decode('utf-8'),
3082                                         'url':          video_real_url.decode('utf-8'),
3083                                         'uploader':     video_uploader.decode('utf-8'),
3084                                         'upload_date':  upload_date,
3085                                         'title':        video_title,
3086                                         'stitle':       simple_title,
3087                                         'ext':          video_extension.decode('utf-8'),
3088                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
3089                                         'thumbnail':    video_thumbnail.decode('utf-8'),
3090                                         'description':  video_description.decode('utf-8'),
3091                                         'player_url':   None,
3092                                 })
3093                         except UnavailableVideoError, err:
3094                                 self._downloader.trouble(u'\nERROR: unable to download video')
3095
class BlipTVIE(InfoExtractor):
	"""Information extractor for blip.tv"""

	# Matches any blip.tv path; group(1) is the path, used as the file id.
	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
	# Extracts the filename extension from a media URL.
	_URL_EXT = r'^.*\.([a-z0-9]+)$'
	IE_NAME = u'blip.tv'

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

	def report_direct_download(self, title):
		"""Report that the URL points directly at a media file."""
		self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

	def _real_extract(self, url):
		"""Extract video information from a blip.tv URL.

		Two cases are handled: the URL may point directly at a media
		file (detected via the Content-Type header), or at a page for
		which blip.tv can serve JSON metadata.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Appending skin=json&version=2&no_wrap=1 makes blip.tv return the
		# video metadata as JSON instead of an HTML page.
		if '?' in url:
			cchar = '&'
		else:
			cchar = '?'
		json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
		request = urllib2.Request(json_url)
		self.report_extraction(mobj.group(1))
		info = None
		try:
			urlh = urllib2.urlopen(request)
			if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
				# The URL is the media file itself; derive id/title/ext
				# from the last path component and keep the open handle
				# so the downloader can reuse it ('urlhandle').
				basename = url.split('/')[-1]
				title,ext = os.path.splitext(basename)
				title = title.decode('UTF-8')
				ext = ext.replace('.', '')
				self.report_direct_download(title)
				info = {
					'id': title,
					'url': url,
					'title': title,
					'stitle': _simplify_title(title),
					'ext': ext,
					'urlhandle': urlh
				}
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
			return
		if info is None: # Regular URL
			try:
				# urlh is still open from above and holds the JSON body.
				json_code = urlh.read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
				return

			try:
				json_data = json.loads(json_code)
				# Metadata may be wrapped in a 'Post' envelope.
				if 'Post' in json_data:
					data = json_data['Post']
				else:
					data = json_data

				# Convert blip.tv's datestamp (e.g. '11-07-09 06:12PM')
				# to the YYYYMMDD form used throughout youtube-dl.
				upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
				video_url = data['media']['url']
				umobj = re.match(self._URL_EXT, video_url)
				if umobj is None:
					raise ValueError('Can not determine filename extension')
				ext = umobj.group(1)

				info = {
					'id': data['item_id'],
					'url': video_url,
					'uploader': data['display_name'],
					'upload_date': upload_date,
					'title': data['title'],
					'stitle': _simplify_title(data['title']),
					'ext': ext,
					'format': data['media']['mimeType'],
					'thumbnail': data['thumbnailUrl'],
					'description': data['description'],
					'player_url': data['embedUrl']
				}
			except (ValueError,KeyError), err:
				# Covers both malformed JSON values and missing keys.
				self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
				return

		self._downloader.increment_downloads()

		try:
			self._downloader.process_info(info)
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
3188
3189
3190 class MyVideoIE(InfoExtractor):
3191         """Information Extractor for myvideo.de."""
3192
3193         _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3194         IE_NAME = u'myvideo'
3195
3196         def __init__(self, downloader=None):
3197                 InfoExtractor.__init__(self, downloader)
3198         
3199         def report_download_webpage(self, video_id):
3200                 """Report webpage download."""
3201                 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3202
3203         def report_extraction(self, video_id):
3204                 """Report information extraction."""
3205                 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3206
3207         def _real_extract(self,url):
3208                 mobj = re.match(self._VALID_URL, url)
3209                 if mobj is None:
3210                         self._download.trouble(u'ERROR: invalid URL: %s' % url)
3211                         return
3212
3213                 video_id = mobj.group(1)
3214
3215                 # Get video webpage
3216                 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3217                 try:
3218                         self.report_download_webpage(video_id)
3219                         webpage = urllib2.urlopen(request).read()
3220                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3221                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3222                         return
3223
3224                 self.report_extraction(video_id)
3225                 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3226                                  webpage)
3227                 if mobj is None:
3228                         self._downloader.trouble(u'ERROR: unable to extract media URL')
3229                         return
3230                 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3231
3232                 mobj = re.search('<title>([^<]+)</title>', webpage)
3233                 if mobj is None:
3234                         self._downloader.trouble(u'ERROR: unable to extract title')
3235                         return
3236
3237                 video_title = mobj.group(1)
3238                 video_title = sanitize_title(video_title)
3239
3240                 simple_title = _simplify_title(video_title)
3241
3242                 try:
3243                         self._downloader.process_info({
3244                                 'id':           video_id,
3245                                 'url':          video_url,
3246                                 'uploader':     u'NA',
3247                                 'upload_date':  u'NA',
3248                                 'title':        video_title,
3249                                 'stitle':       simple_title,
3250                                 'ext':          u'flv',
3251                                 'format':       u'NA',
3252                                 'player_url':   None,
3253                         })
3254                 except UnavailableVideoError:
3255                         self._downloader.trouble(u'\nERROR: Unable to download video')
3256
class ComedyCentralIE(InfoExtractor):
	"""Information extractor for The Daily Show and Colbert Report """

	# Accepts either a short alias (":tds", ":colbert", ...) or a full
	# /full-episodes/ URL on either show's site.
	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
	IE_NAME = u'comedycentral'

	def report_extraction(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

	def report_config_download(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

	def report_index_download(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

	def report_player_url(self, episode_id):
		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

	def _real_extract(self, url):
		"""Resolve the URL (following the redirect to the newest episode
		when only a show was named), download the episode's media index,
		then fetch the per-part configuration and download each part.
		"""
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# Map the short aliases onto the show's full-episodes page.
		if mobj.group('shortname'):
			if mobj.group('shortname') in ('tds', 'thedailyshow'):
				url = u'http://www.thedailyshow.com/full-episodes/'
			else:
				url = u'http://www.colbertnation.com/full-episodes/'
			mobj = re.match(self._VALID_URL, url)
			assert mobj is not None

		# Without a specific episode the site redirects to the newest one.
		dlNewest = not mobj.group('episode')
		if dlNewest:
			epTitle = mobj.group('showname')
		else:
			epTitle = mobj.group('episode')

		req = urllib2.Request(url)
		self.report_extraction(epTitle)
		try:
			htmlHandle = urllib2.urlopen(req)
			html = htmlHandle.read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
			return
		if dlNewest:
			# Pick up the episode URL we were redirected to and re-parse
			# it to obtain the actual episode title.
			url = htmlHandle.geturl()
			mobj = re.match(self._VALID_URL, url)
			if mobj is None:
				self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
				return
			if mobj.group('episode') == '':
				self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
				return
			epTitle = mobj.group('episode')

		# The Flash player URL embeds the mtvnservices URI of the episode.
		mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
		if len(mMovieParams) == 0:
			self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
			return

		# Resolve the player URL through its redirect chain; the final
		# URL is passed to the downloader as player_url (for rtmpdump).
		playerUrl_raw = mMovieParams[0][0]
		self.report_player_url(epTitle)
		try:
			urlHandle = urllib2.urlopen(playerUrl_raw)
			playerUrl = urlHandle.geturl()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
			return

		# The MRSS index lists one <item> per part of the episode.
		uri = mMovieParams[0][1]
		indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
		self.report_index_download(epTitle)
		try:
			indexXml = urllib2.urlopen(indexUrl).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
			return

		idoc = xml.etree.ElementTree.fromstring(indexXml)
		itemEls = idoc.findall('.//item')
		for itemEl in itemEls:
			# <guid> looks like '...:<show>.com:<mediaId>'.
			mediaId = itemEl.findall('./guid')[0].text
			shortMediaId = mediaId.split(':')[-1]
			showId = mediaId.split(':')[-2].replace('.com', '')
			officialTitle = itemEl.findall('./title')[0].text
			officialDate = itemEl.findall('./pubDate')[0].text

			# Per-part configuration lists the available renditions.
			configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
						urllib.urlencode({'uri': mediaId}))
			configReq = urllib2.Request(configUrl)
			self.report_config_download(epTitle)
			try:
				configXml = urllib2.urlopen(configReq).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
				return

			cdoc = xml.etree.ElementTree.fromstring(configXml)
			turls = []
			for rendition in cdoc.findall('.//rendition'):
				finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
				turls.append(finfo)

			if len(turls) == 0:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
				continue

			# For now, just pick the highest bitrate
			format,video_url = turls[-1]

			self._downloader.increment_downloads()

			effTitle = showId + u'-' + epTitle
			info = {
				'id': shortMediaId,
				'url': video_url,
				'uploader': showId,
				'upload_date': officialDate,
				'title': effTitle,
				'stitle': _simplify_title(effTitle),
				'ext': 'mp4',
				'format': format,
				'thumbnail': None,
				'description': officialTitle,
				'player_url': playerUrl
			}

			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
				continue
3391
3392
3393 class EscapistIE(InfoExtractor):
3394         """Information extractor for The Escapist """
3395
3396         _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3397         IE_NAME = u'escapist'
3398
3399         def report_extraction(self, showName):
3400                 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3401
3402         def report_config_download(self, showName):
3403                 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3404
3405         def _real_extract(self, url):
3406                 htmlParser = HTMLParser.HTMLParser()
3407
3408                 mobj = re.match(self._VALID_URL, url)
3409                 if mobj is None:
3410                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3411                         return
3412                 showName = mobj.group('showname')
3413                 videoId = mobj.group('episode')
3414
3415                 self.report_extraction(showName)
3416                 try:
3417                         webPage = urllib2.urlopen(url).read()
3418                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3419                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3420                         return
3421
3422                 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3423                 description = htmlParser.unescape(descMatch.group(1))
3424                 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3425                 imgUrl = htmlParser.unescape(imgMatch.group(1))
3426                 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3427                 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3428                 configUrlMatch = re.search('config=(.*)$', playerUrl)
3429                 configUrl = urllib2.unquote(configUrlMatch.group(1))
3430
3431                 self.report_config_download(showName)
3432                 try:
3433                         configJSON = urllib2.urlopen(configUrl).read()
3434                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3435                         self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3436                         return
3437
3438                 # Technically, it's JavaScript, not JSON
3439                 configJSON = configJSON.replace("'", '"')
3440
3441                 try:
3442                         config = json.loads(configJSON)
3443                 except (ValueError,), err:
3444                         self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3445                         return
3446
3447                 playlist = config['playlist']
3448                 videoUrl = playlist[1]['url']
3449
3450                 self._downloader.increment_downloads()
3451                 info = {
3452                         'id': videoId,
3453                         'url': videoUrl,
3454                         'uploader': showName,
3455                         'upload_date': None,
3456                         'title': showName,
3457                         'stitle': _simplify_title(showName),
3458                         'ext': 'flv',
3459                         'format': 'flv',
3460                         'thumbnail': imgUrl,
3461                         'description': description,
3462                         'player_url': playerUrl,
3463                 }
3464
3465                 try:
3466                         self._downloader.process_info(info)
3467                 except UnavailableVideoError, err:
3468                         self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3469
3470
3471 class CollegeHumorIE(InfoExtractor):
3472         """Information extractor for collegehumor.com"""
3473
3474         _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3475         IE_NAME = u'collegehumor'
3476
3477         def report_webpage(self, video_id):
3478                 """Report information extraction."""
3479                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3480
3481         def report_extraction(self, video_id):
3482                 """Report information extraction."""
3483                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3484
3485         def _real_extract(self, url):
3486                 htmlParser = HTMLParser.HTMLParser()
3487
3488                 mobj = re.match(self._VALID_URL, url)
3489                 if mobj is None:
3490                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3491                         return
3492                 video_id = mobj.group('videoid')
3493
3494                 self.report_webpage(video_id)
3495                 request = urllib2.Request(url)
3496                 try:
3497                         webpage = urllib2.urlopen(request).read()
3498                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3499                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3500                         return
3501
3502                 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3503                 if m is None:
3504                         self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3505                         return
3506                 internal_video_id = m.group('internalvideoid')
3507
3508                 info = {
3509                         'id': video_id,
3510                         'internal_id': internal_video_id,
3511                 }
3512
3513                 self.report_extraction(video_id)
3514                 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3515                 try:
3516                         metaXml = urllib2.urlopen(xmlUrl).read()
3517                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3518                         self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3519                         return
3520
3521                 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3522                 try:
3523                         videoNode = mdoc.findall('./video')[0]
3524                         info['description'] = videoNode.findall('./description')[0].text
3525                         info['title'] = videoNode.findall('./caption')[0].text
3526                         info['stitle'] = _simplify_title(info['title'])
3527                         info['url'] = videoNode.findall('./file')[0].text
3528                         info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3529                         info['ext'] = info['url'].rpartition('.')[2]
3530                         info['format'] = info['ext']
3531                 except IndexError:
3532                         self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3533                         return
3534
3535                 self._downloader.increment_downloads()
3536
3537                 try:
3538                         self._downloader.process_info(info)
3539                 except UnavailableVideoError, err:
3540                         self._downloader.trouble(u'\nERROR: unable to download video')
3541
3542
3543 class XVideosIE(InfoExtractor):
3544         """Information extractor for xvideos.com"""
3545
3546         _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3547         IE_NAME = u'xvideos'
3548
3549         def report_webpage(self, video_id):
3550                 """Report information extraction."""
3551                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3552
3553         def report_extraction(self, video_id):
3554                 """Report information extraction."""
3555                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3556
3557         def _real_extract(self, url):
3558                 htmlParser = HTMLParser.HTMLParser()
3559
3560                 mobj = re.match(self._VALID_URL, url)
3561                 if mobj is None:
3562                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3563                         return
3564                 video_id = mobj.group(1).decode('utf-8')
3565
3566                 self.report_webpage(video_id)
3567
3568                 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3569                 try:
3570                         webpage = urllib2.urlopen(request).read()
3571                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3572                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3573                         return
3574
3575                 self.report_extraction(video_id)
3576
3577
3578                 # Extract video URL
3579                 mobj = re.search(r'flv_url=(.+?)&', webpage)
3580                 if mobj is None:
3581                         self._downloader.trouble(u'ERROR: unable to extract video url')
3582                         return
3583                 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3584
3585
3586                 # Extract title
3587                 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3588                 if mobj is None:
3589                         self._downloader.trouble(u'ERROR: unable to extract video title')
3590                         return
3591                 video_title = mobj.group(1).decode('utf-8')
3592
3593
3594                 # Extract video thumbnail
3595                 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3596                 if mobj is None:
3597                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3598                         return
3599                 video_thumbnail = mobj.group(1).decode('utf-8')
3600
3601
3602
3603                 self._downloader.increment_downloads()
3604                 info = {
3605                         'id': video_id,
3606                         'url': video_url,
3607                         'uploader': None,
3608                         'upload_date': None,
3609                         'title': video_title,
3610                         'stitle': _simplify_title(video_title),
3611                         'ext': 'flv',
3612                         'format': 'flv',
3613                         'thumbnail': video_thumbnail,
3614                         'description': None,
3615                         'player_url': None,
3616                 }
3617
3618                 try:
3619                         self._downloader.process_info(info)
3620                 except UnavailableVideoError, err:
3621                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3622
3623
3624 class SoundcloudIE(InfoExtractor):
3625         """Information extractor for soundcloud.com
3626            To access the media, the uid of the song and a stream token
3627            must be extracted from the page source and the script must make
3628            a request to media.soundcloud.com/crossdomain.xml. Then
3629            the media can be grabbed by requesting from an url composed
3630            of the stream token and uid
3631          """
3632
3633         _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3634         IE_NAME = u'soundcloud'
3635
3636         def __init__(self, downloader=None):
3637                 InfoExtractor.__init__(self, downloader)
3638
3639         def report_webpage(self, video_id):
3640                 """Report information extraction."""
3641                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3642
3643         def report_extraction(self, video_id):
3644                 """Report information extraction."""
3645                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3646
3647         def _real_extract(self, url):
3648                 htmlParser = HTMLParser.HTMLParser()
3649
3650                 mobj = re.match(self._VALID_URL, url)
3651                 if mobj is None:
3652                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3653                         return
3654
3655                 # extract uploader (which is in the url)
3656                 uploader = mobj.group(1).decode('utf-8')
3657                 # extract simple title (uploader + slug of song title)
3658                 slug_title =  mobj.group(2).decode('utf-8')
3659                 simple_title = uploader + '-' + slug_title
3660
3661                 self.report_webpage('%s/%s' % (uploader, slug_title))
3662
3663                 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3664                 try:
3665                         webpage = urllib2.urlopen(request).read()
3666                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3667                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3668                         return
3669
3670                 self.report_extraction('%s/%s' % (uploader, slug_title))
3671
3672                 # extract uid and stream token that soundcloud hands out for access
3673                 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3674                 if mobj:
3675                         video_id = mobj.group(1)
3676                         stream_token = mobj.group(2)
3677
3678                 # extract unsimplified title
3679                 mobj = re.search('"title":"(.*?)",', webpage)
3680                 if mobj:
3681                         title = mobj.group(1)
3682
3683                 # construct media url (with uid/token)
3684                 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3685                 mediaURL = mediaURL % (video_id, stream_token)
3686
3687                 # description
3688                 description = u'No description available'
3689                 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3690                 if mobj:
3691                         description = mobj.group(1)
3692                 
3693                 # upload date
3694                 upload_date = None
3695                 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3696                 if mobj:
3697                         try:
3698                                 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3699                         except Exception, e:
3700                                 print str(e)
3701
3702                 # for soundcloud, a request to a cross domain is required for cookies
3703                 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3704
3705                 try:
3706                         self._downloader.process_info({
3707                                 'id':           video_id.decode('utf-8'),
3708                                 'url':          mediaURL,
3709                                 'uploader':     uploader.decode('utf-8'),
3710                                 'upload_date':  upload_date,
3711                                 'title':        simple_title.decode('utf-8'),
3712                                 'stitle':       simple_title.decode('utf-8'),
3713                                 'ext':          u'mp3',
3714                                 'format':       u'NA',
3715                                 'player_url':   None,
3716                                 'description': description.decode('utf-8')
3717                         })
3718                 except UnavailableVideoError:
3719                         self._downloader.trouble(u'\nERROR: unable to download video')
3720
3721
3722 class InfoQIE(InfoExtractor):
3723         """Information extractor for infoq.com"""
3724
3725         _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3726         IE_NAME = u'infoq'
3727
3728         def report_webpage(self, video_id):
3729                 """Report information extraction."""
3730                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3731
3732         def report_extraction(self, video_id):
3733                 """Report information extraction."""
3734                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3735
3736         def _real_extract(self, url):
3737                 htmlParser = HTMLParser.HTMLParser()
3738
3739                 mobj = re.match(self._VALID_URL, url)
3740                 if mobj is None:
3741                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3742                         return
3743
3744                 self.report_webpage(url)
3745
3746                 request = urllib2.Request(url)
3747                 try:
3748                         webpage = urllib2.urlopen(request).read()
3749                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3750                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3751                         return
3752
3753                 self.report_extraction(url)
3754
3755
3756                 # Extract video URL
3757                 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3758                 if mobj is None:
3759                         self._downloader.trouble(u'ERROR: unable to extract video url')
3760                         return
3761                 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3762
3763
3764                 # Extract title
3765                 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3766                 if mobj is None:
3767                         self._downloader.trouble(u'ERROR: unable to extract video title')
3768                         return
3769                 video_title = mobj.group(1).decode('utf-8')
3770
3771                 # Extract description
3772                 video_description = u'No description available.'
3773                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3774                 if mobj is not None:
3775                         video_description = mobj.group(1).decode('utf-8')
3776
3777                 video_filename = video_url.split('/')[-1]
3778                 video_id, extension = video_filename.split('.')
3779
3780                 self._downloader.increment_downloads()
3781                 info = {
3782                         'id': video_id,
3783                         'url': video_url,
3784                         'uploader': None,
3785                         'upload_date': None,
3786                         'title': video_title,
3787                         'stitle': _simplify_title(video_title),
3788                         'ext': extension,
3789                         'format': extension, # Extension is always(?) mp4, but seems to be flv
3790                         'thumbnail': None,
3791                         'description': video_description,
3792                         'player_url': None,
3793                 }
3794
3795                 try:
3796                         self._downloader.process_info(info)
3797                 except UnavailableVideoError, err:
3798                         self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3799
3800 class MixcloudIE(InfoExtractor):
3801         """Information extractor for www.mixcloud.com"""
3802         _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3803         IE_NAME = u'mixcloud'
3804
3805         def __init__(self, downloader=None):
3806                 InfoExtractor.__init__(self, downloader)
3807
3808         def report_download_json(self, file_id):
3809                 """Report JSON download."""
3810                 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3811
3812         def report_extraction(self, file_id):
3813                 """Report information extraction."""
3814                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3815
3816         def get_urls(self, jsonData, fmt, bitrate='best'):
3817                 """Get urls from 'audio_formats' section in json"""
3818                 file_url = None
3819                 try:
3820                         bitrate_list = jsonData[fmt]
3821                         if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3822                                 bitrate = max(bitrate_list) # select highest
3823
3824                         url_list = jsonData[fmt][bitrate]
3825                 except TypeError: # we have no bitrate info.
3826                         url_list = jsonData[fmt]
3827                                 
3828                 return url_list
3829
3830         def check_urls(self, url_list):
3831                 """Returns 1st active url from list"""
3832                 for url in url_list:
3833                         try:
3834                                 urllib2.urlopen(url)
3835                                 return url
3836                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3837                                 url = None
3838
3839                 return None
3840
3841         def _print_formats(self, formats):
3842                 print 'Available formats:'
3843                 for fmt in formats.keys():
3844                         for b in formats[fmt]:
3845                                 try:
3846                                         ext = formats[fmt][b][0]
3847                                         print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3848                                 except TypeError: # we have no bitrate info
3849                                         ext = formats[fmt][0]
3850                                         print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3851                                         break
3852
3853         def _real_extract(self, url):
3854                 mobj = re.match(self._VALID_URL, url)
3855                 if mobj is None:
3856                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3857                         return
3858                 # extract uploader & filename from url
3859                 uploader = mobj.group(1).decode('utf-8')
3860                 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3861
3862                 # construct API request
3863                 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3864                 # retrieve .json file with links to files
3865                 request = urllib2.Request(file_url)
3866                 try:
3867                         self.report_download_json(file_url)
3868                         jsonData = urllib2.urlopen(request).read()
3869                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3870                         self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3871                         return
3872
3873                 # parse JSON
3874                 json_data = json.loads(jsonData)
3875                 player_url = json_data['player_swf_url']
3876                 formats = dict(json_data['audio_formats'])
3877
3878                 req_format = self._downloader.params.get('format', None)
3879                 bitrate = None
3880
3881                 if self._downloader.params.get('listformats', None):
3882                         self._print_formats(formats)
3883                         return
3884
3885                 if req_format is None or req_format == 'best':
3886                         for format_param in formats.keys():
3887                                 url_list = self.get_urls(formats, format_param)
3888                                 # check urls
3889                                 file_url = self.check_urls(url_list)
3890                                 if file_url is not None:
3891                                         break # got it!
3892                 else:
3893                         if req_format not in formats.keys():
3894                                 self._downloader.trouble(u'ERROR: format is not available')
3895                                 return
3896
3897                         url_list = self.get_urls(formats, req_format)
3898                         file_url = self.check_urls(url_list)
3899                         format_param = req_format
3900
3901                 # We have audio
3902                 self._downloader.increment_downloads()
3903                 try:
3904                         # Process file information
3905                         self._downloader.process_info({
3906                                 'id': file_id.decode('utf-8'),
3907                                 'url': file_url.decode('utf-8'),
3908                                 'uploader':     uploader.decode('utf-8'),
3909                                 'upload_date': u'NA',
3910                                 'title': json_data['name'],
3911                                 'stitle': _simplify_title(json_data['name']),
3912                                 'ext': file_url.split('.')[-1].decode('utf-8'),
3913                                 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3914                                 'thumbnail': json_data['thumbnail_url'],
3915                                 'description': json_data['description'],
3916                                 'player_url': player_url.decode('utf-8'),
3917                         })
3918                 except UnavailableVideoError, err:
3919                         self._downloader.trouble(u'ERROR: unable to download file')
3920
class StanfordOpenClassroomIE(InfoExtractor):
	"""Information extractor for Stanford's Open ClassRoom"""

	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
	IE_NAME = u'stanfordoc'

	def report_download_webpage(self, objid):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

	def _real_extract(self, url):
		# Dispatch on the URL shape: a single video page, a course page
		# (playlist of VideoPage links) or the site root (playlist of
		# CoursePage links). Playlist entries are fed back recursively
		# through self.extract().
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		if mobj.group('course') and mobj.group('video'): # A specific video
			course = mobj.group('course')
			video = mobj.group('video')
			info = {
				'id': _simplify_title(course + '_' + video),
			}

			self.report_extraction(info['id'])
			# Per-video metadata lives in an XML file next to the media.
			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
			xmlUrl = baseUrl + video + '.xml'
			try:
				metaXml = urllib2.urlopen(xmlUrl).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
				return
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
			try:
				info['title'] = mdoc.findall('./title')[0].text
				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
			except IndexError:
				self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
				return
			info['stitle'] = _simplify_title(info['title'])
			info['ext'] = info['url'].rpartition('.')[2]
			info['format'] = info['ext']
			self._downloader.increment_downloads()
			try:
				self._downloader.process_info(info)
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
		elif mobj.group('course'): # A course page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			course = mobj.group('course')
			info = {
				'id': _simplify_title(course),
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			try:
				coursepage = urllib2.urlopen(url).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			# Course title comes from the page <h1>; fall back to the id.
			m = re.search('<h1>([^<]+)</h1>', coursepage)
			if m:
				info['title'] = unescapeHTML(m.group(1))
			else:
				info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			m = re.search('<description>([^<]+)</description>', coursepage)
			if m:
				info['description'] = unescapeHTML(m.group(1))

			# Every VideoPage link becomes a 'reference' playlist entry.
			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
				}
					for vpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
		else: # Root page
			unescapeHTML = HTMLParser.HTMLParser().unescape

			info = {
				'id': 'Stanford OpenClassroom',
				'type': 'playlist',
			}

			self.report_download_webpage(info['id'])
			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
			try:
				rootpage = urllib2.urlopen(rootURL).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
				return

			info['title'] = info['id']
			info['stitle'] = _simplify_title(info['title'])

			# Every CoursePage link is extracted recursively via the
			# course-page branch above.
			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
			info['list'] = [
				{
					'type': 'reference',
					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
				}
					for cpage in links]

			for entry in info['list']:
				assert entry['type'] == 'reference'
				self.extract(entry['url'])
4039
4040 class MTVIE(InfoExtractor):
4041         """Information extractor for MTV.com"""
4042
4043         _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
4044         IE_NAME = u'mtv'
4045
4046         def report_webpage(self, video_id):
4047                 """Report information extraction."""
4048                 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
4049
4050         def report_extraction(self, video_id):
4051                 """Report information extraction."""
4052                 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
4053
4054         def _real_extract(self, url):
4055                 mobj = re.match(self._VALID_URL, url)
4056                 if mobj is None:
4057                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4058                         return
4059                 if not mobj.group('proto'):
4060                         url = 'http://' + url
4061                 video_id = mobj.group('videoid')
4062                 self.report_webpage(video_id)
4063
4064                 request = urllib2.Request(url)
4065                 try:
4066                         webpage = urllib2.urlopen(request).read()
4067                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4068                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
4069                         return
4070
4071                 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
4072                 if mobj is None:
4073                         self._downloader.trouble(u'ERROR: unable to extract song name')
4074                         return
4075                 song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4076                 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
4077                 if mobj is None:
4078                         self._downloader.trouble(u'ERROR: unable to extract performer')
4079                         return
4080                 performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
4081                 video_title = performer + ' - ' + song_name 
4082
4083                 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
4084                 if mobj is None:
4085                         self._downloader.trouble(u'ERROR: unable to mtvn_uri')
4086                         return
4087                 mtvn_uri = mobj.group(1)
4088
4089                 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
4090                 if mobj is None:
4091                         self._downloader.trouble(u'ERROR: unable to extract content id')
4092                         return
4093                 content_id = mobj.group(1)
4094
4095                 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
4096                 self.report_extraction(video_id)
4097                 request = urllib2.Request(videogen_url)
4098                 try:
4099                         metadataXml = urllib2.urlopen(request).read()
4100                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
4101                         self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
4102                         return
4103
4104                 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
4105                 renditions = mdoc.findall('.//rendition')
4106
4107                 # For now, always pick the highest quality.
4108                 rendition = renditions[-1]
4109
4110                 try:
4111                         _,_,ext = rendition.attrib['type'].partition('/')
4112                         format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
4113                         video_url = rendition.find('./src').text
4114                 except KeyError:
4115                         self._downloader.trouble('Invalid rendition field.')
4116                         return
4117
4118                 self._downloader.increment_downloads()
4119                 info = {
4120                         'id': video_id,
4121                         'url': video_url,
4122                         'uploader': performer,
4123                         'title': video_title,
4124                         'stitle': _simplify_title(video_title),
4125                         'ext': ext,
4126                         'format': format,
4127                 }
4128
4129                 try:
4130                         self._downloader.process_info(info)
4131                 except UnavailableVideoError, err:
4132                         self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4133
4134
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are attached to a downloader with its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on
	each; the first call receives the freshly-built information
	dictionary and each later call receives whatever the previous
	one returned.

	The chain stops as soon as a run() returns None, or when the
	last PostProcessor has run.

	Like InfoExtractor, this class participates in a "mutual
	registration" handshake with the downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this PP reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary with one
		extra key, "filepath", naming the downloaded file on disk.

		Return None to halt the postprocessing chain, or an
		information dictionary (possibly the same one, with fields
		adjusted) to pass along to the next PostProcessor. May also
		raise a PostProcessingError, which the calling downloader
		handles.
		"""
		return information # default implementation: pass through unchanged
4180
class AudioConversionError(Exception):
	"""Raised when ffmpeg/ffprobe fails to convert a file's audio track.

	Derives from Exception (not BaseException, which is reserved for
	interpreter-exit exceptions such as KeyboardInterrupt/SystemExit),
	so generic 'except Exception' handlers can see it. The
	human-readable cause is kept in the message attribute, which
	callers (see FFmpegExtractAudioPP.run) report to the user.
	"""
	def __init__(self, message):
		# Also initialize the base class so str(e) and e.args work.
		Exception.__init__(self, message)
		self.message = message
4184
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec and ffmpeg to remux or
	re-encode it into the preferred audio format. Unless keepvideo is
	set, the source video file is deleted afterwards.
	"""

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		"""
		preferredcodec: 'best', 'aac', 'mp3', 'vorbis', 'm4a' or 'wav'
		preferredquality: ffmpeg audio bitrate specification (e.g. '128K')
		keepvideo: if True, keep the source video file on disk
		"""
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path as reported by
		ffprobe, or None if it cannot be determined."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
			# open() instead of the Python2-only file() builtin; close the
			# handle once ffprobe is done so the descriptor does not leak.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		# ffprobe emits one key=value block per stream; remember the last
		# codec_name seen and return it once the stream proves to be audio.
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg on path, writing out_path with the given audio codec
		('copy', a codec name, or None for ffmpeg's default) and extra
		options. Raises AudioConversionError on failure."""
		if codec is None:
			acodec_opts = []
		else:
			acodec_opts = ['-acodec', codec]
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		try:
			p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			if isinstance(e, OSError) and e.errno == 2:
				# errno 2 == ENOENT: the ffmpeg binary itself is missing
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
			else:
				raise e
		if p.returncode != 0:
			# the last stderr line usually carries ffmpeg's error summary
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		"""Convert the downloaded file's audio track. Returns the updated
		information dict on success, or None on failure (which stops the
		postprocessing chain)."""
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		# Decide target codec/extension and the extra ffmpeg options.
		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				acodec = 'copy'
				extension = self._preferredcodec
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
					extension = 'ogg'
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = []
				if self._preferredquality is not None:
					more_opts += ['-ab', self._preferredquality]
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = []
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']
			if self._preferredcodec == 'm4a':
				more_opts += ['-absf', 'aac_adtstoasc']
			if self._preferredcodec == 'vorbis':
				extension = 'ogg'
			if self._preferredcodec == 'wav':
				extension = 'wav'
				more_opts += ['-f', 'wav']

		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		try:
			self.run_ffmpeg(path, new_path, acodec, more_opts)
		except:
			# intentionally broad: must also catch AudioConversionError,
			# whatever exception base class it happens to derive from
			etype,e,tb = sys.exc_info()
			if isinstance(e, AudioConversionError):
				self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
			else:
				self._downloader.to_stderr(u'ERROR: error running ffmpeg')
			return None

		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			try:
				os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			except:
				self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

		if not self._keepvideo:
			try:
				os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
				return None

		information['filepath'] = new_path
		return information
4309
4310
4311 def updateSelf(downloader, filename):
4312         ''' Update the program file with the latest version from the repository '''
4313         # Note: downloader only used for options
4314         if not os.access(filename, os.W_OK):
4315                 sys.exit('ERROR: no write permissions on %s' % filename)
4316
4317         downloader.to_screen(u'Updating to latest version...')
4318
4319         try:
4320                 try:
4321                         urlh = urllib.urlopen(UPDATE_URL)
4322                         newcontent = urlh.read()
4323                         
4324                         vmatch = re.search("__version__ = '([^']+)'", newcontent)
4325                         if vmatch is not None and vmatch.group(1) == __version__:
4326                                 downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
4327                                 return
4328                 finally:
4329                         urlh.close()
4330         except (IOError, OSError), err:
4331                 sys.exit('ERROR: unable to download latest version')
4332
4333         try:
4334                 outf = open(filename, 'wb')
4335                 try:
4336                         outf.write(newcontent)
4337                 finally:
4338                         outf.close()
4339         except (IOError, OSError), err:
4340                 sys.exit('ERROR: unable to overwrite current version')
4341
4342         downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4343
def parseOpts():
	"""Build the optparse parser and parse the program's options.

	Options are collected from /etc/youtube-dl.conf, then the user's
	youtube-dl.conf, and finally sys.argv[1:] (appended last, so
	command-line values win for single-value options).
	Returns the (parser, opts, args) triple.
	"""
	def _readOptions(filename_bytes):
		# Read one configuration file, shell-tokenized, '#' comments allowed.
		try:
			optionf = open(filename_bytes)
		except IOError:
			return [] # silently skip if file is not present
		try:
			res = []
			for l in optionf:
				res += shlex.split(l, comments=True)
		finally:
			optionf.close()
		return res

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		# Terminal width from $COLUMNS, else from 'stty size', else None.
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		except:
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	# 'resolve' lets later declarations override earlier ones for
	# conflicting switches (e.g. -v is declared for both --version and
	# --verbose below; the later --verbose wins).
	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	selection      = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	# NOTE: playliststart/playlistend and retries arrive as strings here;
	# they are converted and validated by the caller (_real_main).
	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
	video_format.add_option('--write-srt',
			action='store_true', dest='writesubtitles',
			help='write video closed captions to a .srt file (currently youtube only)', default=False)
	video_format.add_option('--srt-lang',
			action='store', dest='subtitleslang', metavar='LANG',
			help='language of the closed captions to download (optional) use IETF language tags like \'en\'')


	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)
	# -v is reclaimed from --version here via conflict_handler='resolve'
	verbosity.add_option('-v', '--verbose',
			action='store_true', dest='verbose', help='print various debugging information', default=False)


	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)


	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')


	# Group registration order determines --help display order.
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# User config: $XDG_CONFIG_HOME/youtube-dl.conf, falling back to
	# ~/.config/youtube-dl.conf when XDG_CONFIG_HOME is unset.
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
4556
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# These three are shared with the playlist/user/search extractors
	# that wrap them, so build each of them exactly once.
	yt = YoutubeIE()
	goog = GoogleIE()
	yah = YahooIE()
	extractors = [
		YoutubePlaylistIE(yt),
		YoutubeUserIE(yt),
		YoutubeSearchIE(yt),
		yt,
		MetacafeIE(yt),
		DailymotionIE(),
		goog,
		GoogleSearchIE(goog),
		PhotobucketIE(),
		yah,
		YahooSearchIE(yah),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),
		CollegeHumorIE(),
		XVideosIE(),
		SoundcloudIE(),
		InfoQIE(),
		MixcloudIE(),
		StanfordOpenClassroomIE(),
		MTVIE(),
	]
	# The generic extractor goes last: it is the fallback for URLs that
	# no specialized extractor claimed.
	extractors.append(GenericIE())
	return extractors
4593
4594 def _real_main():
4595         parser, opts, args = parseOpts()
4596
4597         # Open appropriate CookieJar
4598         if opts.cookiefile is None:
4599                 jar = cookielib.CookieJar()
4600         else:
4601                 try:
4602                         jar = cookielib.MozillaCookieJar(opts.cookiefile)
4603                         if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4604                                 jar.load()
4605                 except (IOError, OSError), err:
4606                         sys.exit(u'ERROR: unable to open cookie file')
4607
4608         # Dump user agent
4609         if opts.dump_user_agent:
4610                 print std_headers['User-Agent']
4611                 sys.exit(0)
4612
4613         # Batch file verification
4614         batchurls = []
4615         if opts.batchfile is not None:
4616                 try:
4617                         if opts.batchfile == '-':
4618                                 batchfd = sys.stdin
4619                         else:
4620                                 batchfd = open(opts.batchfile, 'r')
4621                         batchurls = batchfd.readlines()
4622                         batchurls = [x.strip() for x in batchurls]
4623                         batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4624                 except IOError:
4625                         sys.exit(u'ERROR: batch file could not be read')
4626         all_urls = batchurls + args
4627
4628         # General configuration
4629         cookie_processor = urllib2.HTTPCookieProcessor(jar)
4630         proxy_handler = urllib2.ProxyHandler()
4631         opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
4632         urllib2.install_opener(opener)
4633         socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4634
4635         if opts.verbose:
4636                 print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
4637
4638         extractors = gen_extractors()
4639
4640         if opts.list_extractors:
4641                 for ie in extractors:
4642                         print(ie.IE_NAME)
4643                         matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4644                         all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4645                         for mu in matchedUrls:
4646                                 print(u'  ' + mu)
4647                 sys.exit(0)
4648
4649         # Conflicting, missing and erroneous options
4650         if opts.usenetrc and (opts.username is not None or opts.password is not None):
4651                 parser.error(u'using .netrc conflicts with giving username/password')
4652         if opts.password is not None and opts.username is None:
4653                 parser.error(u'account username missing')
4654         if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4655                 parser.error(u'using output template conflicts with using title, literal title or auto number')
4656         if opts.usetitle and opts.useliteral:
4657                 parser.error(u'using title conflicts with using literal title')
4658         if opts.username is not None and opts.password is None:
4659                 opts.password = getpass.getpass(u'Type account password and press return:')
4660         if opts.ratelimit is not None:
4661                 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4662                 if numeric_limit is None:
4663                         parser.error(u'invalid rate limit specified')
4664                 opts.ratelimit = numeric_limit
4665         if opts.retries is not None:
4666                 try:
4667                         opts.retries = long(opts.retries)
4668                 except (TypeError, ValueError), err:
4669                         parser.error(u'invalid retry count specified')
4670         try:
4671                 opts.playliststart = int(opts.playliststart)
4672                 if opts.playliststart <= 0:
4673                         raise ValueError(u'Playlist start must be positive')
4674         except (TypeError, ValueError), err:
4675                 parser.error(u'invalid playlist start number specified')
4676         try:
4677                 opts.playlistend = int(opts.playlistend)
4678                 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4679                         raise ValueError(u'Playlist end must be greater than playlist start')
4680         except (TypeError, ValueError), err:
4681                 parser.error(u'invalid playlist end number specified')
4682         if opts.extractaudio:
4683                 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4684                         parser.error(u'invalid audio format specified')
4685
4686         # File downloader
4687         fd = FileDownloader({
4688                 'usenetrc': opts.usenetrc,
4689                 'username': opts.username,
4690                 'password': opts.password,
4691                 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4692                 'forceurl': opts.geturl,
4693                 'forcetitle': opts.gettitle,
4694                 'forcethumbnail': opts.getthumbnail,
4695                 'forcedescription': opts.getdescription,
4696                 'forcefilename': opts.getfilename,
4697                 'forceformat': opts.getformat,
4698                 'simulate': opts.simulate,
4699                 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4700                 'format': opts.format,
4701                 'format_limit': opts.format_limit,
4702                 'listformats': opts.listformats,
4703                 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4704                         or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4705                         or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4706                         or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4707                         or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4708                         or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4709                         or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4710                         or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4711                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4712                         or u'%(id)s.%(ext)s'),
4713                 'ignoreerrors': opts.ignoreerrors,
4714                 'ratelimit': opts.ratelimit,
4715                 'nooverwrites': opts.nooverwrites,
4716                 'retries': opts.retries,
4717                 'continuedl': opts.continue_dl,
4718                 'noprogress': opts.noprogress,
4719                 'playliststart': opts.playliststart,
4720                 'playlistend': opts.playlistend,
4721                 'logtostderr': opts.outtmpl == '-',
4722                 'consoletitle': opts.consoletitle,
4723                 'nopart': opts.nopart,
4724                 'updatetime': opts.updatetime,
4725                 'writedescription': opts.writedescription,
4726                 'writeinfojson': opts.writeinfojson,
4727                 'writesubtitles': opts.writesubtitles,
4728                 'subtitleslang': opts.subtitleslang,
4729                 'matchtitle': opts.matchtitle,
4730                 'rejecttitle': opts.rejecttitle,
4731                 'max_downloads': opts.max_downloads,
4732                 'prefer_free_formats': opts.prefer_free_formats,
4733                 'verbose': opts.verbose,
4734                 })
4735         for extractor in extractors:
4736                 fd.add_info_extractor(extractor)
4737
4738         # PostProcessors
4739         if opts.extractaudio:
4740                 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
4741
4742         # Update version
4743         if opts.update_self:
4744                 updateSelf(fd, sys.argv[0])
4745
4746         # Maybe do nothing
4747         if len(all_urls) < 1:
4748                 if not opts.update_self:
4749                         parser.error(u'you must provide at least one URL')
4750                 else:
4751                         sys.exit()
4752         
4753         try:
4754                 retcode = fd.download(all_urls)
4755         except MaxDownloadsReached:
4756                 fd.to_screen(u'--max-download limit reached, aborting.')
4757                 retcode = 101
4758
4759         # Dump cookie jar if requested
4760         if opts.cookiefile is not None:
4761                 try:
4762                         jar.save()
4763                 except (IOError, OSError), err:
4764                         sys.exit(u'ERROR: unable to save cookie jar')
4765
4766         sys.exit(retcode)
4767
def main():
	"""Console entry point.

	Runs _real_main() and maps the recognized exceptions onto a process
	exit: DownloadError exits with status code 1, while SameFileError and
	KeyboardInterrupt exit with an explanatory error message on stderr.
	Any other exception propagates to the interpreter unchanged.
	"""
	try:
		_real_main()
	except DownloadError:
		exit_arg = 1
	except SameFileError:
		exit_arg = u'ERROR: fixed output name but more than one file to download'
	except KeyboardInterrupt:
		exit_arg = u'\nERROR: Interrupted by user'
	else:
		return
	sys.exit(exit_arg)
4777
# Script entry point: delegate to main(), which handles the top-level
# exception-to-exit-code translation.
if __name__ == '__main__':
	main()
4780
4781 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: