git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # Author: Benjamin Johnson
   6 # Author: Vasyl' Vavrychuk
   7 # Author: Witold Baryluk
   8 # Author: Paweł Paprota
   9 # Author: Gergely Imreh
  10 # Author: Philipp Hagemeister <phihag@phihag.de>
  11 # License: Public domain code
  12 from __future__ import with_statement
  13 import contextlib
  14 import cookielib
  15 import ctypes
  16 import datetime
  17 import email.utils
  18 import gzip
  19 import htmlentitydefs
  20 import httplib
  21 import locale
  22 import math
  23 import netrc
  24 import os
  25 import os.path
  26 import re
  27 import socket
  28 import string
  29 import subprocess
  30 import sys
  31 import time
  32 import urllib
  33 import urllib2
  34 import warnings
  35 import zlib
  36
  37 try:
  38         import cStringIO as StringIO
  39 except ImportError:
  40         import StringIO
  41
  42 # parse_qs was moved from the cgi module to the urlparse module recently.
  43 try:
  44         from urlparse import parse_qs
  45 except ImportError:
  46         from cgi import parse_qs
  47
  48 try:
  49         import lxml.etree
  50 except ImportError: # Python < 2.6
  51         pass # Handled below
  52
  53 std_headers = {
  54         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
  55         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  56         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  57         'Accept-Encoding': 'gzip, deflate',
  58         'Accept-Language': 'en-us,en;q=0.5',
  59 }
  60
  61 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  62
  63 try:
  64         import json
  65 except ImportError: # Python <2.5, use trivialjson (https://github.com/phihag/trivialjson):
  66         import re
  67         class json(object):
  68                 @staticmethod
  69                 def loads(s):
  70                         s = s.decode('UTF-8')
  71                         def raiseError(msg, i):
  72                                 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
  73                         def skipSpace(i, expectMore=True):
  74                                 while i < len(s) and s[i] in ' \t\r\n':
  75                                         i += 1
  76                                 if expectMore:
  77                                         if i >= len(s):
  78                                                 raiseError('Premature end', i)
  79                                 return i
  80                         def decodeEscape(match):
  81                                 esc = match.group(1)
  82                                 _STATIC = {
  83                                         '"': '"',
  84                                         '\\': '\\',
  85                                         '/': '/',
  86                                         'b': unichr(0x8),
  87                                         'f': unichr(0xc),
  88                                         'n': '\n',
  89                                         'r': '\r',
  90                                         't': '\t',
  91                                 }
  92                                 if esc in _STATIC:
  93                                         return _STATIC[esc]
  94                                 if esc[0] == 'u':
  95                                         if len(esc) == 1+4:
  96                                                 return unichr(int(esc[1:5], 16))
  97                                         if len(esc) == 5+6 and esc[5:7] == '\\u':
  98                                                 hi = int(esc[1:5], 16)
  99                                                 low = int(esc[7:11], 16)
 100                                                 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
 101                                 raise ValueError('Unknown escape ' + str(esc))
 102                         def parseString(i):
 103                                 i += 1
 104                                 e = i
 105                                 while True:
 106                                         e = s.index('"', e)
 107                                         bslashes = 0
 108                                         while s[e-bslashes-1] == '\\':
 109                                                 bslashes += 1
 110                                         if bslashes % 2 == 1:
 111                                                 e += 1
 112                                                 continue
 113                                         break
 114                                 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
 115                                 stri = rexp.sub(decodeEscape, s[i:e])
 116                                 return (e+1,stri)
 117                         def parseObj(i):
 118                                 i += 1
 119                                 res = {}
 120                                 i = skipSpace(i)
 121                                 if s[i] == '}': # Empty dictionary
 122                                         return (i+1,res)
 123                                 while True:
 124                                         if s[i] != '"':
 125                                                 raiseError('Expected a string object key', i)
 126                                         i,key = parseString(i)
 127                                         i = skipSpace(i)
 128                                         if i >= len(s) or s[i] != ':':
 129                                                 raiseError('Expected a colon', i)
 130                                         i,val = parse(i+1)
 131                                         res[key] = val
 132                                         i = skipSpace(i)
 133                                         if s[i] == '}':
 134                                                 return (i+1, res)
 135                                         if s[i] != ',':
 136                                                 raiseError('Expected comma or closing curly brace', i)
 137                                         i = skipSpace(i+1)
 138                         def parseArray(i):
 139                                 res = []
 140                                 i = skipSpace(i+1)
 141                                 if s[i] == ']': # Empty array
 142                                         return (i+1,res)
 143                                 while True:
 144                                         i,val = parse(i)
 145                                         res.append(val)
 146                                         i = skipSpace(i) # Raise exception if premature end
 147                                         if s[i] == ']':
 148                                                 return (i+1, res)
 149                                         if s[i] != ',':
 150                                                 raiseError('Expected a comma or closing bracket', i)
 151                                         i = skipSpace(i+1)
 152                         def parseDiscrete(i):
 153                                 for k,v in {'true': True, 'false': False, 'null': None}.items():
 154                                         if s.startswith(k, i):
 155                                                 return (i+len(k), v)
 156                                 raiseError('Not a boolean (or null)', i)
 157                         def parseNumber(i):
 158                                 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
 159                                 if mobj is None:
 160                                         raiseError('Not a number', i)
 161                                 nums = mobj.group(1)
 162                                 if '.' in nums or 'e' in nums or 'E' in nums:
 163                                         return (i+len(nums), float(nums))
 164                                 return (i+len(nums), int(nums))
 165                         CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
 166                         def parse(i):
 167                                 i = skipSpace(i)
 168                                 i,res = CHARMAP.get(s[i], parseNumber)(i)
 169                                 i = skipSpace(i, False)
 170                                 return (i,res)
 171                         i,res = parse(0)
 172                         if i < len(s):
 173                                 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 174                         return res
 175
 176 def preferredencoding():
 177         """Get preferred encoding.
 178
 179         Returns the best encoding scheme for the system, based on
 180         locale.getpreferredencoding() and some further tweaks.
 181         """
 182         def yield_preferredencoding():
 183                 try:
 184                         pref = locale.getpreferredencoding()
 185                         u'TEST'.encode(pref)
 186                 except:
 187                         pref = 'UTF-8'
 188                 while True:
 189                         yield pref
 190         return yield_preferredencoding().next()
 191
 192 def htmlentity_transform(matchobj):
 193         """Transforms an HTML entity to a Unicode character.
 194
 195         This function receives a match object and is intended to be used with
 196         the re.sub() function.
 197         """
 198         entity = matchobj.group(1)
 199
 200         # Known non-numeric HTML entity
 201         if entity in htmlentitydefs.name2codepoint:
 202                 return unichr(htmlentitydefs.name2codepoint[entity])
 203
 204         # Unicode character
 205         mobj = re.match(ur'(?u)#(x?\d+)', entity)
 206         if mobj is not None:
 207                 numstr = mobj.group(1)
 208                 if numstr.startswith(u'x'):
 209                         base = 16
 210                         numstr = u'0%s' % numstr
 211                 else:
 212                         base = 10
 213                 return unichr(long(numstr, base))
 214
 215         # Unknown entity in name, return its literal representation
 216         return (u'&%s;' % entity)
 217
 218 def sanitize_title(utitle):
 219         """Sanitizes a video title so it could be used as part of a filename."""
 220         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 221         return utitle.replace(unicode(os.sep), u'%')
 222
 223 def sanitize_open(filename, open_mode):
 224         """Try to open the given filename, and slightly tweak it if this fails.
 225
 226         Attempts to open the given filename. If this fails, it tries to change
 227         the filename slightly, step by step, until it's either able to open it
 228         or it fails and raises a final exception, like the standard open()
 229         function.
 230
 231         It returns the tuple (stream, definitive_file_name).
 232         """
 233         try:
 234                 if filename == u'-':
 235                         if sys.platform == 'win32':
 236                                 import msvcrt
 237                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 238                         return (sys.stdout, filename)
 239                 stream = open(filename, open_mode)
 240                 return (stream, filename)
 241         except (IOError, OSError), err:
 242                 # In case of error, try to remove win32 forbidden chars
 243                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 244
 245                 # An exception here should be caught in the caller
 246                 stream = open(filename, open_mode)
 247                 return (stream, filename)
 248
 249 def timeconvert(timestr):
 250     """Convert RFC 2822 defined time string into system timestamp"""
 251     timestamp = None
 252     timetuple = email.utils.parsedate_tz(timestr)
 253     if timetuple is not None:
 254         timestamp = email.utils.mktime_tz(timetuple)
 255     return timestamp
 256
 257 class DownloadError(Exception):
 258         """Download Error exception.
 259
 260         This exception may be thrown by FileDownloader objects if they are not
 261         configured to continue on errors. They will contain the appropriate
 262         error message.
 263         """
 264         pass
 265
 266 class SameFileError(Exception):
 267         """Same File exception.
 268
 269         This exception will be thrown by FileDownloader objects if they detect
 270         multiple files would have to be downloaded to the same file on disk.
 271         """
 272         pass
 273
 274 class PostProcessingError(Exception):
 275         """Post Processing exception.
 276
 277         This exception may be raised by PostProcessor's .run() method to
 278         indicate an error in the postprocessing task.
 279         """
 280         pass
 281
 282 class UnavailableVideoError(Exception):
 283         """Unavailable Format exception.
 284
 285         This exception will be thrown when a video is requested
 286         in a format that is not available for that video.
 287         """
 288         pass
 289
 290 class ContentTooShortError(Exception):
 291         """Content Too Short exception.
 292
 293         This exception may be raised by FileDownloader objects when a file they
 294         download is too small for what the server announced first, indicating
 295         the connection was probably interrupted.
 296         """
 297         # Both in bytes
 298         downloaded = None
 299         expected = None
 300
 301         def __init__(self, downloaded, expected):
 302                 self.downloaded = downloaded
 303                 self.expected = expected
 304
 305 class YoutubeDLHandler(urllib2.HTTPHandler):
 306         """Handler for HTTP requests and responses.
 307
 308         This class, when installed with an OpenerDirector, automatically adds
 309         the standard headers to every HTTP request and handles gzipped and
 310         deflated responses from web servers. If compression is to be avoided in
 311         a particular request, the original request in the program code only has
 312         to include the HTTP header "Youtubedl-No-Compression", which will be
 313         removed before making the real request.
 314
 315         Part of this code was copied from:
 316
 317           http://techknack.net/python-urllib2-handlers/
 318
 319         Andrew Rowls, the author of that code, agreed to release it to the
 320         public domain.
 321         """
 322
 323         @staticmethod
 324         def deflate(data):
 325                 try:
 326                         return zlib.decompress(data, -zlib.MAX_WBITS)
 327                 except zlib.error:
 328                         return zlib.decompress(data)
 329
 330         @staticmethod
 331         def addinfourl_wrapper(stream, headers, url, code):
 332                 if hasattr(urllib2.addinfourl, 'getcode'):
 333                         return urllib2.addinfourl(stream, headers, url, code)
 334                 ret = urllib2.addinfourl(stream, headers, url)
 335                 ret.code = code
 336                 return ret
 337
 338         def http_request(self, req):
 339                 for h in std_headers:
 340                         if h in req.headers:
 341                                 del req.headers[h]
 342                         req.add_header(h, std_headers[h])
 343                 if 'Youtubedl-no-compression' in req.headers:
 344                         if 'Accept-encoding' in req.headers:
 345                                 del req.headers['Accept-encoding']
 346                         del req.headers['Youtubedl-no-compression']
 347                 return req
 348
 349         def http_response(self, req, resp):
 350                 old_resp = resp
 351                 # gzip
 352                 if resp.headers.get('Content-encoding', '') == 'gzip':
 353                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 354                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 355                         resp.msg = old_resp.msg
 356                 # deflate
 357                 if resp.headers.get('Content-encoding', '') == 'deflate':
 358                         gz = StringIO.StringIO(self.deflate(resp.read()))
 359                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 360                         resp.msg = old_resp.msg
 361                 return resp
 362
 363 class FileDownloader(object):
 364         """File Downloader class.
 365
 366         File downloader objects are the ones responsible of downloading the
 367         actual video file and writing it to disk if the user has requested
 368         it, among some other tasks. In most cases there should be one per
 369         program. As, given a video URL, the downloader doesn't know how to
 370         extract all the needed information, task that InfoExtractors do, it
 371         has to pass the URL to one of them.
 372
 373         For this, file downloader objects have a method that allows
 374         InfoExtractors to be registered in a given order. When it is passed
 375         a URL, the file downloader handles it to the first InfoExtractor it
 376         finds that reports being able to handle it. The InfoExtractor extracts
 377         all the information about the video or videos the URL refers to, and
 378         asks the FileDownloader to process the video information, possibly
 379         downloading the video.
 380
 381         File downloaders accept a lot of parameters. In order not to saturate
 382         the object constructor with arguments, it receives a dictionary of
 383         options instead. These options are available through the params
 384         attribute for the InfoExtractors to use. The FileDownloader also
 385         registers itself as the downloader in charge for the InfoExtractors
 386         that are added to it, so this is a "mutual registration".
 387
 388         Available options:
 389
 390         username:         Username for authentication purposes.
 391         password:         Password for authentication purposes.
 392         usenetrc:         Use netrc for authentication instead.
 393         quiet:            Do not print messages to stdout.
 394         forceurl:         Force printing final URL.
 395         forcetitle:       Force printing title.
 396         forcethumbnail:   Force printing thumbnail URL.
 397         forcedescription: Force printing description.
 398         forcefilename:    Force printing final filename.
 399         simulate:         Do not download the video files.
 400         format:           Video format code.
 401         format_limit:     Highest quality format to try.
 402         outtmpl:          Template for output names.
 403         ignoreerrors:     Do not stop on download errors.
 404         ratelimit:        Download speed limit, in bytes/sec.
 405         nooverwrites:     Prevent overwriting files.
 406         retries:          Number of times to retry for HTTP error 5xx
 407         continuedl:       Try to continue downloads if possible.
 408         noprogress:       Do not print the progress bar.
 409         playliststart:    Playlist item to start at.
 410         playlistend:      Playlist item to end at.
 411         logtostderr:      Log messages to stderr instead of stdout.
 412         consoletitle:     Display progress in console window's titlebar.
 413         nopart:           Do not use temporary .part files.
 414         updatetime:       Use the Last-modified header to set output file timestamps.
 415         writedescription: Write the video description to a .description file
 416         """
 417
 418         params = None
 419         _ies = []
 420         _pps = []
 421         _download_retcode = None
 422         _num_downloads = None
 423         _screen_file = None
 424
 425         def __init__(self, params):
 426                 """Create a FileDownloader object with the given options."""
 427                 self._ies = []
 428                 self._pps = []
 429                 self._download_retcode = 0
 430                 self._num_downloads = 0
 431                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 432                 self.params = params
 433
 434         @staticmethod
 435         def pmkdir(filename):
 436                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 437                 components = filename.split(os.sep)
 438                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 439                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 440                 for dir in aggregate:
 441                         if not os.path.exists(dir):
 442                                 os.mkdir(dir)
 443
 444         @staticmethod
 445         def format_bytes(bytes):
 446                 if bytes is None:
 447                         return 'N/A'
 448                 if type(bytes) is str:
 449                         bytes = float(bytes)
 450                 if bytes == 0.0:
 451                         exponent = 0
 452                 else:
 453                         exponent = long(math.log(bytes, 1024.0))
 454                 suffix = 'bkMGTPEZY'[exponent]
 455                 converted = float(bytes) / float(1024**exponent)
 456                 return '%.2f%s' % (converted, suffix)
 457
 458         @staticmethod
 459         def calc_percent(byte_counter, data_len):
 460                 if data_len is None:
 461                         return '---.-%'
 462                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 463
 464         @staticmethod
 465         def calc_eta(start, now, total, current):
 466                 if total is None:
 467                         return '--:--'
 468                 dif = now - start
 469                 if current == 0 or dif < 0.001: # One millisecond
 470                         return '--:--'
 471                 rate = float(current) / dif
 472                 eta = long((float(total) - float(current)) / rate)
 473                 (eta_mins, eta_secs) = divmod(eta, 60)
 474                 if eta_mins > 99:
 475                         return '--:--'
 476                 return '%02d:%02d' % (eta_mins, eta_secs)
 477
 478         @staticmethod
 479         def calc_speed(start, now, bytes):
 480                 dif = now - start
 481                 if bytes == 0 or dif < 0.001: # One millisecond
 482                         return '%10s' % '---b/s'
 483                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 484
 485         @staticmethod
 486         def best_block_size(elapsed_time, bytes):
 487                 new_min = max(bytes / 2.0, 1.0)
 488                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 489                 if elapsed_time < 0.001:
 490                         return long(new_max)
 491                 rate = bytes / elapsed_time
 492                 if rate > new_max:
 493                         return long(new_max)
 494                 if rate < new_min:
 495                         return long(new_min)
 496                 return long(rate)
 497
 498         @staticmethod
 499         def parse_bytes(bytestr):
 500                 """Parse a string indicating a byte quantity into a long integer."""
 501                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 502                 if matchobj is None:
 503                         return None
 504                 number = float(matchobj.group(1))
 505                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 506                 return long(round(number * multiplier))
 507
 508         def add_info_extractor(self, ie):
 509                 """Add an InfoExtractor object to the end of the list."""
 510                 self._ies.append(ie)
 511                 ie.set_downloader(self)
 512
 513         def add_post_processor(self, pp):
 514                 """Add a PostProcessor object to the end of the chain."""
 515                 self._pps.append(pp)
 516                 pp.set_downloader(self)
 517
 518         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
 519                 """Print message to stdout if not in quiet mode."""
 520                 try:
 521                         if not self.params.get('quiet', False):
 522                                 terminator = [u'\n', u''][skip_eol]
 523                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
 524                         self._screen_file.flush()
 525                 except (UnicodeEncodeError), err:
 526                         if not ignore_encoding_errors:
 527                                 raise
 528
 529         def to_stderr(self, message):
 530                 """Print message to stderr."""
 531                 print >>sys.stderr, message.encode(preferredencoding())
 532
 533         def to_cons_title(self, message):
 534                 """Set console/terminal window title to message."""
 535                 if not self.params.get('consoletitle', False):
 536                         return
 537                 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 538                         # c_wchar_p() might not be necessary if `message` is
 539                         # already of type unicode()
 540                         ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 541                 elif 'TERM' in os.environ:
 542                         sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
 543
 544         def fixed_template(self):
 545                 """Checks if the output template is fixed."""
 546                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 547
 548         def trouble(self, message=None):
 549                 """Determine action to take when a download problem appears.
 550
 551                 Depending on if the downloader has been configured to ignore
 552                 download errors or not, this method may throw an exception or
 553                 not when errors are found, after printing the message.
 554                 """
 555                 if message is not None:
 556                         self.to_stderr(message)
 557                 if not self.params.get('ignoreerrors', False):
 558                         raise DownloadError(message)
 559                 self._download_retcode = 1
 560
 561         def slow_down(self, start_time, byte_counter):
 562                 """Sleep if the download speed is over the rate limit."""
 563                 rate_limit = self.params.get('ratelimit', None)
 564                 if rate_limit is None or byte_counter == 0:
 565                         return
 566                 now = time.time()
 567                 elapsed = now - start_time
 568                 if elapsed <= 0.0:
 569                         return
 570                 speed = float(byte_counter) / elapsed
 571                 if speed > rate_limit:
 572                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 573
 574         def temp_name(self, filename):
 575                 """Returns a temporary filename for the given filename."""
 576                 if self.params.get('nopart', False) or filename == u'-' or \
 577                                 (os.path.exists(filename) and not os.path.isfile(filename)):
 578                         return filename
 579                 return filename + u'.part'
 580
 581         def undo_temp_name(self, filename):
 582                 if filename.endswith(u'.part'):
 583                         return filename[:-len(u'.part')]
 584                 return filename
 585
 586         def try_rename(self, old_filename, new_filename):
 587                 try:
 588                         if old_filename == new_filename:
 589                                 return
 590                         os.rename(old_filename, new_filename)
 591                 except (IOError, OSError), err:
 592                         self.trouble(u'ERROR: unable to rename file')
 593
 594         def try_utime(self, filename, last_modified_hdr):
 595                 """Try to set the last-modified time of the given file."""
 596                 if last_modified_hdr is None:
 597                         return
 598                 if not os.path.isfile(filename):
 599                         return
 600                 timestr = last_modified_hdr
 601                 if timestr is None:
 602                         return
 603                 filetime = timeconvert(timestr)
 604                 if filetime is None:
 605                         return
 606                 try:
 607                         os.utime(filename,(time.time(), filetime))
 608                 except:
 609                         pass
 610
 611         def report_writedescription(self, descfn):
 612                 """ Report that the description file has been written """
 613                 self.to_screen(u'[info] Video description written to: %s' % descfn, ignore_encoding_errors=True)
 614
 615         def report_destination(self, filename):
 616                 """Report destination filename."""
 617                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
 618
 619         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 620                 """Report download progress."""
 621                 if self.params.get('noprogress', False):
 622                         return
 623                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
 624                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 625                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
 626                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
 627
 628         def report_resuming_byte(self, resume_len):
 629                 """Report attempt to resume at given byte."""
 630                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
 631
 632         def report_retry(self, count, retries):
 633                 """Report retry in case of HTTP error 5xx"""
 634                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
 635
 636         def report_file_already_downloaded(self, file_name):
 637                 """Report file has already been fully downloaded."""
 638                 try:
 639                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
 640                 except (UnicodeEncodeError), err:
 641                         self.to_screen(u'[download] The file has already been downloaded')
 642
 643         def report_unable_to_resume(self):
 644                 """Report it was impossible to resume download."""
 645                 self.to_screen(u'[download] Unable to resume')
 646
 647         def report_finish(self):
 648                 """Report download finished."""
 649                 if self.params.get('noprogress', False):
 650                         self.to_screen(u'[download] Download completed')
 651                 else:
 652                         self.to_screen(u'')
 653
 654         def increment_downloads(self):
 655                 """Increment the ordinal that assigns a number to each file."""
 656                 self._num_downloads += 1
 657
 658         def prepare_filename(self, info_dict):
 659                 """Generate the output filename."""
 660                 try:
 661                         template_dict = dict(info_dict)
 662                         template_dict['epoch'] = unicode(long(time.time()))
 663                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
 664                         filename = self.params['outtmpl'] % template_dict
 665                         return filename
 666                 except (ValueError, KeyError), err:
 667                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
 668                         return None
 669
 670         def process_info(self, info_dict):
 671                 """Process a single dictionary returned by an InfoExtractor."""
 672                 filename = self.prepare_filename(info_dict)
 673                 # Do nothing else if in simulate mode
 674                 if self.params.get('simulate', False):
 675                         # Forced printings
 676                         if self.params.get('forcetitle', False):
 677                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
 678                         if self.params.get('forceurl', False):
 679                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
 680                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 681                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
 682                         if self.params.get('forcedescription', False) and 'description' in info_dict:
 683                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
 684                         if self.params.get('forcefilename', False) and filename is not None:
 685                                 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
 686
 687                         return
 688
 689                 if filename is None:
 690                         return
 691                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
 692                         self.to_stderr(u'WARNING: file exists and will be skipped')
 693                         return
 694
 695                 try:
 696                         self.pmkdir(filename)
 697                 except (OSError, IOError), err:
 698                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
 699                         return
 700
 701                 if self.params.get('writedescription', False):
 702                         try:
 703                                 descfn = filename + '.description'
 704                                 with contextlib.closing(open(descfn, 'wb')) as descfile:
 705                                         descfile.write(info_dict['description'].encode('utf-8'))
 706                                 self.report_writedescription(descfn)
 707                         except (OSError, IOError):
 708                                 self.trouble(u'ERROR: Cannot write description file: %s' % str(descfn))
 709                                 return
 710
 711                 try:
 712                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
 713                 except (OSError, IOError), err:
 714                         raise UnavailableVideoError
 715                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 716                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
 717                         return
 718                 except (ContentTooShortError, ), err:
 719                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 720                         return
 721
 722                 if success:
 723                         try:
 724                                 self.post_process(filename, info_dict)
 725                         except (PostProcessingError), err:
 726                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
 727                                 return
 728
 729         def download(self, url_list):
 730                 """Download a given list of URLs."""
 731                 if len(url_list) > 1 and self.fixed_template():
 732                         raise SameFileError(self.params['outtmpl'])
 733
 734                 for url in url_list:
 735                         suitable_found = False
 736                         for ie in self._ies:
 737                                 # Go to next InfoExtractor if not suitable
 738                                 if not ie.suitable(url):
 739                                         continue
 740
 741                                 # Suitable InfoExtractor found
 742                                 suitable_found = True
 743
 744                                 # Extract information from URL and process it
 745                                 ie.extract(url)
 746
 747                                 # Suitable InfoExtractor had been found; go to next URL
 748                                 break
 749
 750                         if not suitable_found:
 751                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 752
 753                 return self._download_retcode
 754
 755         def post_process(self, filename, ie_info):
 756                 """Run the postprocessing chain on the given file."""
 757                 info = dict(ie_info)
 758                 info['filepath'] = filename
 759                 for pp in self._pps:
 760                         info = pp.run(info)
 761                         if info is None:
 762                                 break
 763
 764         def _download_with_rtmpdump(self, filename, url, player_url):
 765                 self.report_destination(filename)
 766                 tmpfilename = self.temp_name(filename)
 767
 768                 # Check for rtmpdump first
 769                 try:
 770                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
 771                 except (OSError, IOError):
 772                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
 773                         return False
 774
 775                 # Download using rtmpdump. rtmpdump returns exit code 2 when
 776                 # the connection was interrumpted and resuming appears to be
 777                 # possible. This is part of rtmpdump's normal usage, AFAIK.
 778                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
 779                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
 780                 while retval == 2 or retval == 1:
 781                         prevsize = os.path.getsize(tmpfilename)
 782                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
 783                         time.sleep(5.0) # This seems to be needed
 784                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
 785                         cursize = os.path.getsize(tmpfilename)
 786                         if prevsize == cursize and retval == 1:
 787                                 break
 788                 if retval == 0:
 789                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
 790                         self.try_rename(tmpfilename, filename)
 791                         return True
 792                 else:
 793                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
 794                         return False
 795
 796         def _do_download(self, filename, url, player_url):
 797                 # Check file already present
 798                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
 799                         self.report_file_already_downloaded(filename)
 800                         return True
 801
 802                 # Attempt to download using rtmpdump
 803                 if url.startswith('rtmp'):
 804                         return self._download_with_rtmpdump(filename, url, player_url)
 805
 806                 tmpfilename = self.temp_name(filename)
 807                 stream = None
 808                 open_mode = 'wb'
 809
 810                 # Do not include the Accept-Encoding header
 811                 headers = {'Youtubedl-no-compression': 'True'}
 812                 basic_request = urllib2.Request(url, None, headers)
 813                 request = urllib2.Request(url, None, headers)
 814
 815                 # Establish possible resume length
 816                 if os.path.isfile(tmpfilename):
 817                         resume_len = os.path.getsize(tmpfilename)
 818                 else:
 819                         resume_len = 0
 820
 821                 # Request parameters in case of being able to resume
 822                 if self.params.get('continuedl', False) and resume_len != 0:
 823                         self.report_resuming_byte(resume_len)
 824                         request.add_header('Range','bytes=%d-' % resume_len)
 825                         open_mode = 'ab'
 826
 827                 count = 0
 828                 retries = self.params.get('retries', 0)
 829                 while count <= retries:
 830                         # Establish connection
 831                         try:
 832                                 data = urllib2.urlopen(request)
 833                                 break
 834                         except (urllib2.HTTPError, ), err:
 835                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
 836                                         # Unexpected HTTP error
 837                                         raise
 838                                 elif err.code == 416:
 839                                         # Unable to resume (requested range not satisfiable)
 840                                         try:
 841                                                 # Open the connection again without the range header
 842                                                 data = urllib2.urlopen(basic_request)
 843                                                 content_length = data.info()['Content-Length']
 844                                         except (urllib2.HTTPError, ), err:
 845                                                 if err.code < 500 or err.code >= 600:
 846                                                         raise
 847                                         else:
 848                                                 # Examine the reported length
 849                                                 if (content_length is not None and
 850                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
 851                                                         # The file had already been fully downloaded.
 852                                                         # Explanation to the above condition: in issue #175 it was revealed that
 853                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
 854                                                         # changing the file size slightly and causing problems for some users. So
 855                                                         # I decided to implement a suggested change and consider the file
 856                                                         # completely downloaded if the file size differs less than 100 bytes from
 857                                                         # the one in the hard drive.
 858                                                         self.report_file_already_downloaded(filename)
 859                                                         self.try_rename(tmpfilename, filename)
 860                                                         return True
 861                                                 else:
 862                                                         # The length does not match, we start the download over
 863                                                         self.report_unable_to_resume()
 864                                                         open_mode = 'wb'
 865                                                         break
 866                         # Retry
 867                         count += 1
 868                         if count <= retries:
 869                                 self.report_retry(count, retries)
 870
 871                 if count > retries:
 872                         self.trouble(u'ERROR: giving up after %s retries' % retries)
 873                         return False
 874
 875                 data_len = data.info().get('Content-length', None)
 876                 if data_len is not None:
 877                         data_len = long(data_len) + resume_len
 878                 data_len_str = self.format_bytes(data_len)
 879                 byte_counter = 0 + resume_len
 880                 block_size = 1024
 881                 start = time.time()
 882                 while True:
 883                         # Download and write
 884                         before = time.time()
 885                         data_block = data.read(block_size)
 886                         after = time.time()
 887                         if len(data_block) == 0:
 888                                 break
 889                         byte_counter += len(data_block)
 890
 891                         # Open file just in time
 892                         if stream is None:
 893                                 try:
 894                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
 895                                         filename = self.undo_temp_name(tmpfilename)
 896                                         self.report_destination(filename)
 897                                 except (OSError, IOError), err:
 898                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
 899                                         return False
 900                         try:
 901                                 stream.write(data_block)
 902                         except (IOError, OSError), err:
 903                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
 904                                 return False
 905                         block_size = self.best_block_size(after - before, len(data_block))
 906
 907                         # Progress message
 908                         percent_str = self.calc_percent(byte_counter, data_len)
 909                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
 910                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
 911                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 912
 913                         # Apply rate limit
 914                         self.slow_down(start, byte_counter - resume_len)
 915
 916                 stream.close()
 917                 self.report_finish()
 918                 if data_len is not None and byte_counter != data_len:
 919                         raise ContentTooShortError(byte_counter, long(data_len))
 920                 self.try_rename(tmpfilename, filename)
 921
 922                 # Update file modification time
 923                 if self.params.get('updatetime', True):
 924                         self.try_utime(filename, data.info().get('last-modified', None))
 925
 926                 return True
 927
 928 class InfoExtractor(object):
 929         """Information Extractor class.
 930
 931         Information extractors are the classes that, given a URL, extract
 932         information from the video (or videos) the URL refers to. This
 933         information includes the real video URL, the video title and simplified
 934         title, author and others. The information is stored in a dictionary
 935         which is then passed to the FileDownloader. The FileDownloader
 936         processes this information possibly downloading the video to the file
 937         system, among other possible outcomes. The dictionaries must include
 938         the following fields:
 939
 940         id:             Video identifier.
 941         url:            Final video URL.
 942         uploader:       Nickname of the video uploader.
 943         title:          Literal title.
 944         stitle:         Simplified title.
 945         ext:            Video filename extension.
 946         format:         Video format.
 947         player_url:     SWF Player URL (may be None).
 948
 949         The following fields are optional. Their primary purpose is to allow
 950         youtube-dl to serve as the backend for a video search function, such
 951         as the one in youtube2mp3.  They are only used when their respective
 952         forced printing functions are called:
 953
 954         thumbnail:      Full URL to a video thumbnail image.
 955         description:    One-line video description.
 956
 957         Subclasses of this one should re-define the _real_initialize() and
 958         _real_extract() methods, as well as the suitable() static method.
 959         Probably, they should also be instantiated and added to the main
 960         downloader.
 961         """
 962
 963         _ready = False
 964         _downloader = None
 965
 966         def __init__(self, downloader=None):
 967                 """Constructor. Receives an optional downloader."""
 968                 self._ready = False
 969                 self.set_downloader(downloader)
 970
 971         @staticmethod
 972         def suitable(url):
 973                 """Receives a URL and returns True if suitable for this IE."""
 974                 return False
 975
 976         def initialize(self):
 977                 """Initializes an instance (authentication, etc)."""
 978                 if not self._ready:
 979                         self._real_initialize()
 980                         self._ready = True
 981
 982         def extract(self, url):
 983                 """Extracts URL information and returns it in list of dicts."""
 984                 self.initialize()
 985                 return self._real_extract(url)
 986
 987         def set_downloader(self, downloader):
 988                 """Sets the downloader for this IE."""
 989                 self._downloader = downloader
 990
 991         def _real_initialize(self):
 992                 """Real initialization process. Redefine in subclasses."""
 993                 pass
 994
 995         def _real_extract(self, url):
 996                 """Real extraction process. Redefine in subclasses."""
 997                 pass
 998
 999 class YoutubeIE(InfoExtractor):
1000         """Information extractor for youtube.com."""
1001
1002         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
1003         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1004         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1005         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1006         _NETRC_MACHINE = 'youtube'
1007         # Listed in order of quality
1008         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
1009         _video_extensions = {
1010                 '13': '3gp',
1011                 '17': 'mp4',
1012                 '18': 'mp4',
1013                 '22': 'mp4',
1014                 '37': 'mp4',
1015                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1016                 '43': 'webm',
1017                 '45': 'webm',
1018         }
1019
1020         @staticmethod
1021         def suitable(url):
1022                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
1023
1024         def report_lang(self):
1025                 """Report attempt to set language."""
1026                 self._downloader.to_screen(u'[youtube] Setting language')
1027
1028         def report_login(self):
1029                 """Report attempt to log in."""
1030                 self._downloader.to_screen(u'[youtube] Logging in')
1031
1032         def report_age_confirmation(self):
1033                 """Report attempt to confirm age."""
1034                 self._downloader.to_screen(u'[youtube] Confirming age')
1035
1036         def report_video_webpage_download(self, video_id):
1037                 """Report attempt to download video webpage."""
1038                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1039
1040         def report_video_info_webpage_download(self, video_id):
1041                 """Report attempt to download video info webpage."""
1042                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1043
1044         def report_information_extraction(self, video_id):
1045                 """Report attempt to extract video information."""
1046                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1047
1048         def report_unavailable_format(self, video_id, format):
1049                 """Report extracted video URL."""
1050                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1051
1052         def report_rtmp_download(self):
1053                 """Indicate the download will use the RTMP protocol."""
1054                 self._downloader.to_screen(u'[youtube] RTMP download detected')
1055
1056         def _real_initialize(self):
1057                 if self._downloader is None:
1058                         return
1059
1060                 username = None
1061                 password = None
1062                 downloader_params = self._downloader.params
1063
1064                 # Attempt to use provided username and password or .netrc data
1065                 if downloader_params.get('username', None) is not None:
1066                         username = downloader_params['username']
1067                         password = downloader_params['password']
1068                 elif downloader_params.get('usenetrc', False):
1069                         try:
1070                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1071                                 if info is not None:
1072                                         username = info[0]
1073                                         password = info[2]
1074                                 else:
1075                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1076                         except (IOError, netrc.NetrcParseError), err:
1077                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1078                                 return
1079
1080                 # Set language
1081                 request = urllib2.Request(self._LANG_URL)
1082                 try:
1083                         self.report_lang()
1084                         urllib2.urlopen(request).read()
1085                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1086                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1087                         return
1088
1089                 # No authentication to be performed
1090                 if username is None:
1091                         return
1092
1093                 # Log in
1094                 login_form = {
1095                                 'current_form': 'loginForm',
1096                                 'next':         '/',
1097                                 'action_login': 'Log In',
1098                                 'username':     username,
1099                                 'password':     password,
1100                                 }
1101                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1102                 try:
1103                         self.report_login()
1104                         login_results = urllib2.urlopen(request).read()
1105                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1106                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1107                                 return
1108                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1109                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1110                         return
1111
1112                 # Confirm age
1113                 age_form = {
1114                                 'next_url':             '/',
1115                                 'action_confirm':       'Confirm',
1116                                 }
1117                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1118                 try:
1119                         self.report_age_confirmation()
1120                         age_results = urllib2.urlopen(request).read()
1121                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1122                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1123                         return
1124
1125         def _real_extract(self, url):
1126                 # Extract video id from URL
1127                 mobj = re.match(self._VALID_URL, url)
1128                 if mobj is None:
1129                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1130                         return
1131                 video_id = mobj.group(2)
1132
1133                 # Get video webpage
1134                 self.report_video_webpage_download(video_id)
1135                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1136                 try:
1137                         video_webpage = urllib2.urlopen(request).read()
1138                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1139                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1140                         return
1141
1142                 # Attempt to extract SWF player URL
1143                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1144                 if mobj is not None:
1145                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1146                 else:
1147                         player_url = None
1148
1149                 # Get video info
1150                 self.report_video_info_webpage_download(video_id)
1151                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1152                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1153                                            % (video_id, el_type))
1154                         request = urllib2.Request(video_info_url)
1155                         try:
1156                                 video_info_webpage = urllib2.urlopen(request).read()
1157                                 video_info = parse_qs(video_info_webpage)
1158                                 if 'token' in video_info:
1159                                         break
1160                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1161                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1162                                 return
1163                 if 'token' not in video_info:
1164                         if 'reason' in video_info:
1165                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1166                         else:
1167                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1168                         return
1169
1170                 # Start extracting information
1171                 self.report_information_extraction(video_id)
1172
1173                 # uploader
1174                 if 'author' not in video_info:
1175                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1176                         return
1177                 video_uploader = urllib.unquote_plus(video_info['author'][0])
1178
1179                 # title
1180                 if 'title' not in video_info:
1181                         self._downloader.trouble(u'ERROR: unable to extract video title')
1182                         return
1183                 video_title = urllib.unquote_plus(video_info['title'][0])
1184                 video_title = video_title.decode('utf-8')
1185                 video_title = sanitize_title(video_title)
1186
1187                 # simplified title
1188                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1189                 simple_title = simple_title.strip(ur'_')
1190
1191                 # thumbnail image
1192                 if 'thumbnail_url' not in video_info:
1193                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1194                         video_thumbnail = ''
1195                 else:   # don't panic if we can't find it
1196                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1197
1198                 # upload date
1199                 upload_date = u'NA'
1200                 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1201                 if mobj is not None:
1202                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1203                         format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1204                         for expression in format_expressions:
1205                                 try:
1206                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1207                                 except:
1208                                         pass
1209
1210                 # description
1211                 try:
1212                         lxml.etree
1213                 except NameError:
1214                         video_description = u'No description available.'
1215                         if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1216                                 warnings.warn(u'You are using an old Python version, install Python 2.6+ or lxml. Falling back to old video description extractor.')
1217                                 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1218                                 if mobj is not None:
1219                                         video_description = mobj.group(1).decode('utf-8')
1220                 else:
1221                         html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1222                         vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1223                         video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1224
1225                 # token
1226                 video_token = urllib.unquote_plus(video_info['token'][0])
1227
1228                 # Decide which formats to download
1229                 req_format = self._downloader.params.get('format', None)
1230
1231                 if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
1232                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1233                         format_limit = self._downloader.params.get('format_limit', None)
1234                         if format_limit is not None and format_limit in self._available_formats:
1235                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
1236                         else:
1237                                 format_list = self._available_formats
1238                         existing_formats = [x for x in format_list if x in url_map]
1239                         if len(existing_formats) == 0:
1240                                 self._downloader.trouble(u'ERROR: no known formats available for video')
1241                                 return
1242                         if req_format is None:
1243                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1244                         elif req_format == '-1':
1245                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1246                         else:
1247                                 # Specific format
1248                                 if req_format not in url_map:
1249                                         self._downloader.trouble(u'ERROR: requested format not available')
1250                                         return
1251                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
1252
1253                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1254                         self.report_rtmp_download()
1255                         video_url_list = [(None, video_info['conn'][0])]
1256
1257                 else:
1258                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1259                         return
1260
1261                 for format_param, video_real_url in video_url_list:
1262                         # At this point we have a new video
1263                         self._downloader.increment_downloads()
1264
1265                         # Extension
1266                         video_extension = self._video_extensions.get(format_param, 'flv')
1267
1268                         # Find the video URL in fmt_url_map or conn paramters
1269                         try:
1270                                 # Process video information
1271                                 self._downloader.process_info({
1272                                         'id':           video_id.decode('utf-8'),
1273                                         'url':          video_real_url.decode('utf-8'),
1274                                         'uploader':     video_uploader.decode('utf-8'),
1275                                         'upload_date':  upload_date,
1276                                         'title':        video_title,
1277                                         'stitle':       simple_title,
1278                                         'ext':          video_extension.decode('utf-8'),
1279                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
1280                                         'thumbnail':    video_thumbnail.decode('utf-8'),
1281                                         'description':  video_description,
1282                                         'player_url':   player_url,
1283                                 })
1284                         except UnavailableVideoError, err:
1285                                 self._downloader.trouble(u'\nERROR: unable to download video')
1286
1287
1288 class MetacafeIE(InfoExtractor):
1289         """Information Extractor for metacafe.com."""
1290
1291         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1292         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1293         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1294         _youtube_ie = None
1295
1296         def __init__(self, youtube_ie, downloader=None):
1297                 InfoExtractor.__init__(self, downloader)
1298                 self._youtube_ie = youtube_ie
1299
1300         @staticmethod
1301         def suitable(url):
1302                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1303
1304         def report_disclaimer(self):
1305                 """Report disclaimer retrieval."""
1306                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1307
1308         def report_age_confirmation(self):
1309                 """Report attempt to confirm age."""
1310                 self._downloader.to_screen(u'[metacafe] Confirming age')
1311
1312         def report_download_webpage(self, video_id):
1313                 """Report webpage download."""
1314                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1315
1316         def report_extraction(self, video_id):
1317                 """Report information extraction."""
1318                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1319
1320         def _real_initialize(self):
1321                 # Retrieve disclaimer
1322                 request = urllib2.Request(self._DISCLAIMER)
1323                 try:
1324                         self.report_disclaimer()
1325                         disclaimer = urllib2.urlopen(request).read()
1326                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1327                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1328                         return
1329
1330                 # Confirm age
1331                 disclaimer_form = {
1332                         'filters': '0',
1333                         'submit': "Continue - I'm over 18",
1334                         }
1335                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1336                 try:
1337                         self.report_age_confirmation()
1338                         disclaimer = urllib2.urlopen(request).read()
1339                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1340                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1341                         return
1342
1343         def _real_extract(self, url):
1344                 # Extract id and simplified title from URL
1345                 mobj = re.match(self._VALID_URL, url)
1346                 if mobj is None:
1347                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1348                         return
1349
1350                 video_id = mobj.group(1)
1351
1352                 # Check if video comes from YouTube
1353                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1354                 if mobj2 is not None:
1355                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1356                         return
1357
1358                 # At this point we have a new video
1359                 self._downloader.increment_downloads()
1360
1361                 simple_title = mobj.group(2).decode('utf-8')
1362
1363                 # Retrieve video webpage to extract further information
1364                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1365                 try:
1366                         self.report_download_webpage(video_id)
1367                         webpage = urllib2.urlopen(request).read()
1368                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1370                         return
1371
1372                 # Extract URL, uploader and title from webpage
1373                 self.report_extraction(video_id)
1374                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1375                 if mobj is not None:
1376                         mediaURL = urllib.unquote(mobj.group(1))
1377                         video_extension = mediaURL[-3:]
1378
1379                         # Extract gdaKey if available
1380                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1381                         if mobj is None:
1382                                 video_url = mediaURL
1383                         else:
1384                                 gdaKey = mobj.group(1)
1385                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1386                 else:
1387                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1388                         if mobj is None:
1389                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1390                                 return
1391                         vardict = parse_qs(mobj.group(1))
1392                         if 'mediaData' not in vardict:
1393                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1394                                 return
1395                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1396                         if mobj is None:
1397                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1398                                 return
1399                         mediaURL = mobj.group(1).replace('\\/', '/')
1400                         video_extension = mediaURL[-3:]
1401                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1402
1403                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1404                 if mobj is None:
1405                         self._downloader.trouble(u'ERROR: unable to extract title')
1406                         return
1407                 video_title = mobj.group(1).decode('utf-8')
1408                 video_title = sanitize_title(video_title)
1409
1410                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1411                 if mobj is None:
1412                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1413                         return
1414                 video_uploader = mobj.group(1)
1415
1416                 try:
1417                         # Process video information
1418                         self._downloader.process_info({
1419                                 'id':           video_id.decode('utf-8'),
1420                                 'url':          video_url.decode('utf-8'),
1421                                 'uploader':     video_uploader.decode('utf-8'),
1422                                 'upload_date':  u'NA',
1423                                 'title':        video_title,
1424                                 'stitle':       simple_title,
1425                                 'ext':          video_extension.decode('utf-8'),
1426                                 'format':       u'NA',
1427                                 'player_url':   None,
1428                         })
1429                 except UnavailableVideoError:
1430                         self._downloader.trouble(u'\nERROR: unable to download video')
1431
1432
1433 class DailymotionIE(InfoExtractor):
1434         """Information Extractor for Dailymotion"""
1435
1436         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1437
1438         def __init__(self, downloader=None):
1439                 InfoExtractor.__init__(self, downloader)
1440
1441         @staticmethod
1442         def suitable(url):
1443                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1444
1445         def report_download_webpage(self, video_id):
1446                 """Report webpage download."""
1447                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1448
1449         def report_extraction(self, video_id):
1450                 """Report information extraction."""
1451                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1452
1453         def _real_initialize(self):
1454                 return
1455
1456         def _real_extract(self, url):
1457                 # Extract id and simplified title from URL
1458                 mobj = re.match(self._VALID_URL, url)
1459                 if mobj is None:
1460                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1461                         return
1462
1463                 # At this point we have a new video
1464                 self._downloader.increment_downloads()
1465                 video_id = mobj.group(1)
1466
1467                 simple_title = mobj.group(2).decode('utf-8')
1468                 video_extension = 'flv'
1469
1470                 # Retrieve video webpage to extract further information
1471                 request = urllib2.Request(url)
1472                 try:
1473                         self.report_download_webpage(video_id)
1474                         webpage = urllib2.urlopen(request).read()
1475                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1476                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1477                         return
1478
1479                 # Extract URL, uploader and title from webpage
1480                 self.report_extraction(video_id)
1481                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1482                 if mobj is None:
1483                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1484                         return
1485                 mediaURL = urllib.unquote(mobj.group(1))
1486
1487                 # if needed add http://www.dailymotion.com/ if relative URL
1488
1489                 video_url = mediaURL
1490
1491                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1492                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1493                 if mobj is None:
1494                         self._downloader.trouble(u'ERROR: unable to extract title')
1495                         return
1496                 video_title = mobj.group(1).decode('utf-8')
1497                 video_title = sanitize_title(video_title)
1498
1499                 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1500                 if mobj is None:
1501                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1502                         return
1503                 video_uploader = mobj.group(1)
1504
1505                 try:
1506                         # Process video information
1507                         self._downloader.process_info({
1508                                 'id':           video_id.decode('utf-8'),
1509                                 'url':          video_url.decode('utf-8'),
1510                                 'uploader':     video_uploader.decode('utf-8'),
1511                                 'upload_date':  u'NA',
1512                                 'title':        video_title,
1513                                 'stitle':       simple_title,
1514                                 'ext':          video_extension.decode('utf-8'),
1515                                 'format':       u'NA',
1516                                 'player_url':   None,
1517                         })
1518                 except UnavailableVideoError:
1519                         self._downloader.trouble(u'\nERROR: unable to download video')
1520
1521 class GoogleIE(InfoExtractor):
1522         """Information extractor for video.google.com."""
1523
1524         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1525
1526         def __init__(self, downloader=None):
1527                 InfoExtractor.__init__(self, downloader)
1528
1529         @staticmethod
1530         def suitable(url):
1531                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1532
1533         def report_download_webpage(self, video_id):
1534                 """Report webpage download."""
1535                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1536
1537         def report_extraction(self, video_id):
1538                 """Report information extraction."""
1539                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1540
1541         def _real_initialize(self):
1542                 return
1543
1544         def _real_extract(self, url):
1545                 # Extract id from URL
1546                 mobj = re.match(self._VALID_URL, url)
1547                 if mobj is None:
1548                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1549                         return
1550
1551                 # At this point we have a new video
1552                 self._downloader.increment_downloads()
1553                 video_id = mobj.group(1)
1554
1555                 video_extension = 'mp4'
1556
1557                 # Retrieve video webpage to extract further information
1558                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1559                 try:
1560                         self.report_download_webpage(video_id)
1561                         webpage = urllib2.urlopen(request).read()
1562                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1563                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1564                         return
1565
1566                 # Extract URL, uploader, and title from webpage
1567                 self.report_extraction(video_id)
1568                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1569                 if mobj is None:
1570                         video_extension = 'flv'
1571                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1572                 if mobj is None:
1573                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1574                         return
1575                 mediaURL = urllib.unquote(mobj.group(1))
1576                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1577                 mediaURL = mediaURL.replace('\\x26', '\x26')
1578
1579                 video_url = mediaURL
1580
1581                 mobj = re.search(r'<title>(.*)</title>', webpage)
1582                 if mobj is None:
1583                         self._downloader.trouble(u'ERROR: unable to extract title')
1584                         return
1585                 video_title = mobj.group(1).decode('utf-8')
1586                 video_title = sanitize_title(video_title)
1587                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1588
1589                 # Extract video description
1590                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1591                 if mobj is None:
1592                         self._downloader.trouble(u'ERROR: unable to extract video description')
1593                         return
1594                 video_description = mobj.group(1).decode('utf-8')
1595                 if not video_description:
1596                         video_description = 'No description available.'
1597
1598                 # Extract video thumbnail
1599                 if self._downloader.params.get('forcethumbnail', False):
1600                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1601                         try:
1602                                 webpage = urllib2.urlopen(request).read()
1603                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1604                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1605                                 return
1606                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1607                         if mobj is None:
1608                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1609                                 return
1610                         video_thumbnail = mobj.group(1)
1611                 else:   # we need something to pass to process_info
1612                         video_thumbnail = ''
1613
1614
1615                 try:
1616                         # Process video information
1617                         self._downloader.process_info({
1618                                 'id':           video_id.decode('utf-8'),
1619                                 'url':          video_url.decode('utf-8'),
1620                                 'uploader':     u'NA',
1621                                 'upload_date':  u'NA',
1622                                 'title':        video_title,
1623                                 'stitle':       simple_title,
1624                                 'ext':          video_extension.decode('utf-8'),
1625                                 'format':       u'NA',
1626                                 'player_url':   None,
1627                         })
1628                 except UnavailableVideoError:
1629                         self._downloader.trouble(u'\nERROR: unable to download video')
1630
1631
1632 class PhotobucketIE(InfoExtractor):
1633         """Information extractor for photobucket.com."""
1634
1635         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1636
1637         def __init__(self, downloader=None):
1638                 InfoExtractor.__init__(self, downloader)
1639
1640         @staticmethod
1641         def suitable(url):
1642                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1643
1644         def report_download_webpage(self, video_id):
1645                 """Report webpage download."""
1646                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1647
1648         def report_extraction(self, video_id):
1649                 """Report information extraction."""
1650                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1651
1652         def _real_initialize(self):
1653                 return
1654
1655         def _real_extract(self, url):
1656                 # Extract id from URL
1657                 mobj = re.match(self._VALID_URL, url)
1658                 if mobj is None:
1659                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1660                         return
1661
1662                 # At this point we have a new video
1663                 self._downloader.increment_downloads()
1664                 video_id = mobj.group(1)
1665
1666                 video_extension = 'flv'
1667
1668                 # Retrieve video webpage to extract further information
1669                 request = urllib2.Request(url)
1670                 try:
1671                         self.report_download_webpage(video_id)
1672                         webpage = urllib2.urlopen(request).read()
1673                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1674                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1675                         return
1676
1677                 # Extract URL, uploader, and title from webpage
1678                 self.report_extraction(video_id)
1679                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1680                 if mobj is None:
1681                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1682                         return
1683                 mediaURL = urllib.unquote(mobj.group(1))
1684
1685                 video_url = mediaURL
1686
1687                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1688                 if mobj is None:
1689                         self._downloader.trouble(u'ERROR: unable to extract title')
1690                         return
1691                 video_title = mobj.group(1).decode('utf-8')
1692                 video_title = sanitize_title(video_title)
1693                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1694
1695                 video_uploader = mobj.group(2).decode('utf-8')
1696
1697                 try:
1698                         # Process video information
1699                         self._downloader.process_info({
1700                                 'id':           video_id.decode('utf-8'),
1701                                 'url':          video_url.decode('utf-8'),
1702                                 'uploader':     video_uploader,
1703                                 'upload_date':  u'NA',
1704                                 'title':        video_title,
1705                                 'stitle':       simple_title,
1706                                 'ext':          video_extension.decode('utf-8'),
1707                                 'format':       u'NA',
1708                                 'player_url':   None,
1709                         })
1710                 except UnavailableVideoError:
1711                         self._downloader.trouble(u'\nERROR: unable to download video')
1712
1713
1714 class YahooIE(InfoExtractor):
1715         """Information extractor for video.yahoo.com."""
1716
1717         # _VALID_URL matches all Yahoo! Video URLs
1718         # _VPAGE_URL matches only the extractable '/watch/' URLs
1719         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1720         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1721
1722         def __init__(self, downloader=None):
1723                 InfoExtractor.__init__(self, downloader)
1724
1725         @staticmethod
1726         def suitable(url):
1727                 return (re.match(YahooIE._VALID_URL, url) is not None)
1728
1729         def report_download_webpage(self, video_id):
1730                 """Report webpage download."""
1731                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1732
1733         def report_extraction(self, video_id):
1734                 """Report information extraction."""
1735                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1736
1737         def _real_initialize(self):
1738                 return
1739
1740         def _real_extract(self, url, new_video=True):
1741                 # Extract ID from URL
1742                 mobj = re.match(self._VALID_URL, url)
1743                 if mobj is None:
1744                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1745                         return
1746
1747                 # At this point we have a new video
1748                 self._downloader.increment_downloads()
1749                 video_id = mobj.group(2)
1750                 video_extension = 'flv'
1751
1752                 # Rewrite valid but non-extractable URLs as
1753                 # extractable English language /watch/ URLs
1754                 if re.match(self._VPAGE_URL, url) is None:
1755                         request = urllib2.Request(url)
1756                         try:
1757                                 webpage = urllib2.urlopen(request).read()
1758                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1759                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1760                                 return
1761
1762                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1763                         if mobj is None:
1764                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1765                                 return
1766                         yahoo_id = mobj.group(1)
1767
1768                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1769                         if mobj is None:
1770                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1771                                 return
1772                         yahoo_vid = mobj.group(1)
1773
1774                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1775                         return self._real_extract(url, new_video=False)
1776
1777                 # Retrieve video webpage to extract further information
1778                 request = urllib2.Request(url)
1779                 try:
1780                         self.report_download_webpage(video_id)
1781                         webpage = urllib2.urlopen(request).read()
1782                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1784                         return
1785
1786                 # Extract uploader and title from webpage
1787                 self.report_extraction(video_id)
1788                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1789                 if mobj is None:
1790                         self._downloader.trouble(u'ERROR: unable to extract video title')
1791                         return
1792                 video_title = mobj.group(1).decode('utf-8')
1793                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1794
1795                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1796                 if mobj is None:
1797                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1798                         return
1799                 video_uploader = mobj.group(1).decode('utf-8')
1800
1801                 # Extract video thumbnail
1802                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1803                 if mobj is None:
1804                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1805                         return
1806                 video_thumbnail = mobj.group(1).decode('utf-8')
1807
1808                 # Extract video description
1809                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1810                 if mobj is None:
1811                         self._downloader.trouble(u'ERROR: unable to extract video description')
1812                         return
1813                 video_description = mobj.group(1).decode('utf-8')
1814                 if not video_description: video_description = 'No description available.'
1815
1816                 # Extract video height and width
1817                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1818                 if mobj is None:
1819                         self._downloader.trouble(u'ERROR: unable to extract video height')
1820                         return
1821                 yv_video_height = mobj.group(1)
1822
1823                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: unable to extract video width')
1826                         return
1827                 yv_video_width = mobj.group(1)
1828
1829                 # Retrieve video playlist to extract media URL
1830                 # I'm not completely sure what all these options are, but we
1831                 # seem to need most of them, otherwise the server sends a 401.
1832                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1833                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1834                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1835                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1836                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1837                 try:
1838                         self.report_download_webpage(video_id)
1839                         webpage = urllib2.urlopen(request).read()
1840                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1842                         return
1843
1844                 # Extract media URL from playlist XML
1845                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1846                 if mobj is None:
1847                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1848                         return
1849                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1850                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1851
1852                 try:
1853                         # Process video information
1854                         self._downloader.process_info({
1855                                 'id':           video_id.decode('utf-8'),
1856                                 'url':          video_url,
1857                                 'uploader':     video_uploader,
1858                                 'upload_date':  u'NA',
1859                                 'title':        video_title,
1860                                 'stitle':       simple_title,
1861                                 'ext':          video_extension.decode('utf-8'),
1862                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1863                                 'description':  video_description,
1864                                 'thumbnail':    video_thumbnail,
1865                                 'description':  video_description,
1866                                 'player_url':   None,
1867                         })
1868                 except UnavailableVideoError:
1869                         self._downloader.trouble(u'\nERROR: unable to download video')
1870
1871
1872 class GenericIE(InfoExtractor):
1873         """Generic last-resort information extractor."""
1874
1875         def __init__(self, downloader=None):
1876                 InfoExtractor.__init__(self, downloader)
1877
1878         @staticmethod
1879         def suitable(url):
1880                 return True
1881
1882         def report_download_webpage(self, video_id):
1883                 """Report webpage download."""
1884                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1885                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1886
1887         def report_extraction(self, video_id):
1888                 """Report information extraction."""
1889                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1890
1891         def _real_initialize(self):
1892                 return
1893
1894         def _real_extract(self, url):
1895                 # At this point we have a new video
1896                 self._downloader.increment_downloads()
1897
1898                 video_id = url.split('/')[-1]
1899                 request = urllib2.Request(url)
1900                 try:
1901                         self.report_download_webpage(video_id)
1902                         webpage = urllib2.urlopen(request).read()
1903                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1904                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1905                         return
1906                 except ValueError, err:
1907                         # since this is the last-resort InfoExtractor, if
1908                         # this error is thrown, it'll be thrown here
1909                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1910                         return
1911
1912                 self.report_extraction(video_id)
1913                 # Start with something easy: JW Player in SWFObject
1914                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1915                 if mobj is None:
1916                         # Broaden the search a little bit
1917                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1918                 if mobj is None:
1919                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1920                         return
1921
1922                 # It's possible that one of the regexes
1923                 # matched, but returned an empty group:
1924                 if mobj.group(1) is None:
1925                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1926                         return
1927
1928                 video_url = urllib.unquote(mobj.group(1))
1929                 video_id  = os.path.basename(video_url)
1930
1931                 # here's a fun little line of code for you:
1932                 video_extension = os.path.splitext(video_id)[1][1:]
1933                 video_id        = os.path.splitext(video_id)[0]
1934
1935                 # it's tempting to parse this further, but you would
1936                 # have to take into account all the variations like
1937                 #   Video Title - Site Name
1938                 #   Site Name | Video Title
1939                 #   Video Title - Tagline | Site Name
1940                 # and so on and so forth; it's just not practical
1941                 mobj = re.search(r'<title>(.*)</title>', webpage)
1942                 if mobj is None:
1943                         self._downloader.trouble(u'ERROR: unable to extract title')
1944                         return
1945                 video_title = mobj.group(1).decode('utf-8')
1946                 video_title = sanitize_title(video_title)
1947                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1948
1949                 # video uploader is domain name
1950                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1951                 if mobj is None:
1952                         self._downloader.trouble(u'ERROR: unable to extract title')
1953                         return
1954                 video_uploader = mobj.group(1).decode('utf-8')
1955
1956                 try:
1957                         # Process video information
1958                         self._downloader.process_info({
1959                                 'id':           video_id.decode('utf-8'),
1960                                 'url':          video_url.decode('utf-8'),
1961                                 'uploader':     video_uploader,
1962                                 'upload_date':  u'NA',
1963                                 'title':        video_title,
1964                                 'stitle':       simple_title,
1965                                 'ext':          video_extension.decode('utf-8'),
1966                                 'format':       u'NA',
1967                                 'player_url':   None,
1968                         })
1969                 except UnavailableVideoError, err:
1970                         self._downloader.trouble(u'\nERROR: unable to download video')
1971
1972
1973 class YoutubeSearchIE(InfoExtractor):
1974         """Information Extractor for YouTube search queries."""
1975         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1976         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1977         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1978         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1979         _youtube_ie = None
1980         _max_youtube_results = 1000
1981
1982         def __init__(self, youtube_ie, downloader=None):
1983                 InfoExtractor.__init__(self, downloader)
1984                 self._youtube_ie = youtube_ie
1985
1986         @staticmethod
1987         def suitable(url):
1988                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1989
1990         def report_download_page(self, query, pagenum):
1991                 """Report attempt to download playlist page with given number."""
1992                 query = query.decode(preferredencoding())
1993                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1994
1995         def _real_initialize(self):
1996                 self._youtube_ie.initialize()
1997
1998         def _real_extract(self, query):
1999                 mobj = re.match(self._VALID_QUERY, query)
2000                 if mobj is None:
2001                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2002                         return
2003
2004                 prefix, query = query.split(':')
2005                 prefix = prefix[8:]
2006                 query  = query.encode('utf-8')
2007                 if prefix == '':
2008                         self._download_n_results(query, 1)
2009                         return
2010                 elif prefix == 'all':
2011                         self._download_n_results(query, self._max_youtube_results)
2012                         return
2013                 else:
2014                         try:
2015                                 n = long(prefix)
2016                                 if n <= 0:
2017                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2018                                         return
2019                                 elif n > self._max_youtube_results:
2020                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
2021                                         n = self._max_youtube_results
2022                                 self._download_n_results(query, n)
2023                                 return
2024                         except ValueError: # parsing prefix as integer fails
2025                                 self._download_n_results(query, 1)
2026                                 return
2027
2028         def _download_n_results(self, query, n):
2029                 """Downloads a specified number of results for a query"""
2030
2031                 video_ids = []
2032                 already_seen = set()
2033                 pagenum = 1
2034
2035                 while True:
2036                         self.report_download_page(query, pagenum)
2037                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2038                         request = urllib2.Request(result_url)
2039                         try:
2040                                 page = urllib2.urlopen(request).read()
2041                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2042                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2043                                 return
2044
2045                         # Extract video identifiers
2046                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2047                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2048                                 if video_id not in already_seen:
2049                                         video_ids.append(video_id)
2050                                         already_seen.add(video_id)
2051                                         if len(video_ids) == n:
2052                                                 # Specified n videos reached
2053                                                 for id in video_ids:
2054                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2055                                                 return
2056
2057                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2058                                 for id in video_ids:
2059                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2060                                 return
2061
2062                         pagenum = pagenum + 1
2063
2064 class GoogleSearchIE(InfoExtractor):
2065         """Information Extractor for Google Video search queries."""
2066         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2067         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2068         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2069         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2070         _google_ie = None
2071         _max_google_results = 1000
2072
2073         def __init__(self, google_ie, downloader=None):
2074                 InfoExtractor.__init__(self, downloader)
2075                 self._google_ie = google_ie
2076
2077         @staticmethod
2078         def suitable(url):
2079                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2080
2081         def report_download_page(self, query, pagenum):
2082                 """Report attempt to download playlist page with given number."""
2083                 query = query.decode(preferredencoding())
2084                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2085
2086         def _real_initialize(self):
2087                 self._google_ie.initialize()
2088
2089         def _real_extract(self, query):
2090                 mobj = re.match(self._VALID_QUERY, query)
2091                 if mobj is None:
2092                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2093                         return
2094
2095                 prefix, query = query.split(':')
2096                 prefix = prefix[8:]
2097                 query  = query.encode('utf-8')
2098                 if prefix == '':
2099                         self._download_n_results(query, 1)
2100                         return
2101                 elif prefix == 'all':
2102                         self._download_n_results(query, self._max_google_results)
2103                         return
2104                 else:
2105                         try:
2106                                 n = long(prefix)
2107                                 if n <= 0:
2108                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2109                                         return
2110                                 elif n > self._max_google_results:
2111                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
2112                                         n = self._max_google_results
2113                                 self._download_n_results(query, n)
2114                                 return
2115                         except ValueError: # parsing prefix as integer fails
2116                                 self._download_n_results(query, 1)
2117                                 return
2118
2119         def _download_n_results(self, query, n):
2120                 """Downloads a specified number of results for a query"""
2121
2122                 video_ids = []
2123                 already_seen = set()
2124                 pagenum = 1
2125
2126                 while True:
2127                         self.report_download_page(query, pagenum)
2128                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2129                         request = urllib2.Request(result_url)
2130                         try:
2131                                 page = urllib2.urlopen(request).read()
2132                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2133                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2134                                 return
2135
2136                         # Extract video identifiers
2137                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2138                                 video_id = mobj.group(1)
2139                                 if video_id not in already_seen:
2140                                         video_ids.append(video_id)
2141                                         already_seen.add(video_id)
2142                                         if len(video_ids) == n:
2143                                                 # Specified n videos reached
2144                                                 for id in video_ids:
2145                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2146                                                 return
2147
2148                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2149                                 for id in video_ids:
2150                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2151                                 return
2152
2153                         pagenum = pagenum + 1
2154
2155 class YahooSearchIE(InfoExtractor):
2156         """Information Extractor for Yahoo! Video search queries."""
2157         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2158         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2159         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2160         _MORE_PAGES_INDICATOR = r'\s*Next'
2161         _yahoo_ie = None
2162         _max_yahoo_results = 1000
2163
2164         def __init__(self, yahoo_ie, downloader=None):
2165                 InfoExtractor.__init__(self, downloader)
2166                 self._yahoo_ie = yahoo_ie
2167
2168         @staticmethod
2169         def suitable(url):
2170                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2171
2172         def report_download_page(self, query, pagenum):
2173                 """Report attempt to download playlist page with given number."""
2174                 query = query.decode(preferredencoding())
2175                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2176
2177         def _real_initialize(self):
2178                 self._yahoo_ie.initialize()
2179
2180         def _real_extract(self, query):
2181                 mobj = re.match(self._VALID_QUERY, query)
2182                 if mobj is None:
2183                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2184                         return
2185
2186                 prefix, query = query.split(':')
2187                 prefix = prefix[8:]
2188                 query  = query.encode('utf-8')
2189                 if prefix == '':
2190                         self._download_n_results(query, 1)
2191                         return
2192                 elif prefix == 'all':
2193                         self._download_n_results(query, self._max_yahoo_results)
2194                         return
2195                 else:
2196                         try:
2197                                 n = long(prefix)
2198                                 if n <= 0:
2199                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2200                                         return
2201                                 elif n > self._max_yahoo_results:
2202                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2203                                         n = self._max_yahoo_results
2204                                 self._download_n_results(query, n)
2205                                 return
2206                         except ValueError: # parsing prefix as integer fails
2207                                 self._download_n_results(query, 1)
2208                                 return
2209
2210         def _download_n_results(self, query, n):
2211                 """Downloads a specified number of results for a query"""
2212
2213                 video_ids = []
2214                 already_seen = set()
2215                 pagenum = 1
2216
2217                 while True:
2218                         self.report_download_page(query, pagenum)
2219                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2220                         request = urllib2.Request(result_url)
2221                         try:
2222                                 page = urllib2.urlopen(request).read()
2223                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2224                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2225                                 return
2226
2227                         # Extract video identifiers
2228                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2229                                 video_id = mobj.group(1)
2230                                 if video_id not in already_seen:
2231                                         video_ids.append(video_id)
2232                                         already_seen.add(video_id)
2233                                         if len(video_ids) == n:
2234                                                 # Specified n videos reached
2235                                                 for id in video_ids:
2236                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2237                                                 return
2238
2239                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2240                                 for id in video_ids:
2241                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2242                                 return
2243
2244                         pagenum = pagenum + 1
2245
2246 class YoutubePlaylistIE(InfoExtractor):
2247         """Information Extractor for YouTube playlists."""
2248
2249         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2250         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2251         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2252         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2253         _youtube_ie = None
2254
2255         def __init__(self, youtube_ie, downloader=None):
2256                 InfoExtractor.__init__(self, downloader)
2257                 self._youtube_ie = youtube_ie
2258
2259         @staticmethod
2260         def suitable(url):
2261                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2262
2263         def report_download_page(self, playlist_id, pagenum):
2264                 """Report attempt to download playlist page with given number."""
2265                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2266
2267         def _real_initialize(self):
2268                 self._youtube_ie.initialize()
2269
2270         def _real_extract(self, url):
2271                 # Extract playlist id
2272                 mobj = re.match(self._VALID_URL, url)
2273                 if mobj is None:
2274                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2275                         return
2276
2277                 # Single video case
2278                 if mobj.group(3) is not None:
2279                         self._youtube_ie.extract(mobj.group(3))
2280                         return
2281
2282                 # Download playlist pages
2283                 # prefix is 'p' as default for playlists but there are other types that need extra care
2284                 playlist_prefix = mobj.group(1)
2285                 if playlist_prefix == 'a':
2286                         playlist_access = 'artist'
2287                 else:
2288                         playlist_prefix = 'p'
2289                         playlist_access = 'view_play_list'
2290                 playlist_id = mobj.group(2)
2291                 video_ids = []
2292                 pagenum = 1
2293
2294                 while True:
2295                         self.report_download_page(playlist_id, pagenum)
2296                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2297                         try:
2298                                 page = urllib2.urlopen(request).read()
2299                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2300                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2301                                 return
2302
2303                         # Extract video identifiers
2304                         ids_in_page = []
2305                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2306                                 if mobj.group(1) not in ids_in_page:
2307                                         ids_in_page.append(mobj.group(1))
2308                         video_ids.extend(ids_in_page)
2309
2310                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2311                                 break
2312                         pagenum = pagenum + 1
2313
2314                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2315                 playlistend = self._downloader.params.get('playlistend', -1)
2316                 video_ids = video_ids[playliststart:playlistend]
2317
2318                 for id in video_ids:
2319                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2320                 return
2321
2322 class YoutubeUserIE(InfoExtractor):
2323         """Information Extractor for YouTube users."""
2324
2325         _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2326         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2327         _GDATA_PAGE_SIZE = 50
2328         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2329         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2330         _youtube_ie = None
2331
2332         def __init__(self, youtube_ie, downloader=None):
2333                 InfoExtractor.__init__(self, downloader)
2334                 self._youtube_ie = youtube_ie
2335
2336         @staticmethod
2337         def suitable(url):
2338                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2339
2340         def report_download_page(self, username, start_index):
2341                 """Report attempt to download user page."""
2342                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2343                                            (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2344
2345         def _real_initialize(self):
2346                 self._youtube_ie.initialize()
2347
2348         def _real_extract(self, url):
2349                 # Extract username
2350                 mobj = re.match(self._VALID_URL, url)
2351                 if mobj is None:
2352                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2353                         return
2354
2355                 username = mobj.group(1)
2356
2357                 # Download video ids using YouTube Data API. Result size per
2358                 # query is limited (currently to 50 videos) so we need to query
2359                 # page by page until there are no video ids - it means we got
2360                 # all of them.
2361
2362                 video_ids = []
2363                 pagenum = 0
2364
2365                 while True:
2366                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2367                         self.report_download_page(username, start_index)
2368
2369                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2370
2371                         try:
2372                                 page = urllib2.urlopen(request).read()
2373                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2374                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2375                                 return
2376
2377                         # Extract video identifiers
2378                         ids_in_page = []
2379
2380                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2381                                 if mobj.group(1) not in ids_in_page:
2382                                         ids_in_page.append(mobj.group(1))
2383
2384                         video_ids.extend(ids_in_page)
2385
2386                         # A little optimization - if current page is not
2387                         # "full", ie. does not contain PAGE_SIZE video ids then
2388                         # we can assume that this page is the last one - there
2389                         # are no more ids on further pages - no need to query
2390                         # again.
2391
2392                         if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2393                                 break
2394
2395                         pagenum += 1
2396
2397                 all_ids_count = len(video_ids)
2398                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2399                 playlistend = self._downloader.params.get('playlistend', -1)
2400
2401                 if playlistend == -1:
2402                         video_ids = video_ids[playliststart:]
2403                 else:
2404                         video_ids = video_ids[playliststart:playlistend]
2405
2406                 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2407                                            (username, all_ids_count, len(video_ids)))
2408
2409                 for video_id in video_ids:
2410                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2411
2412
2413 class DepositFilesIE(InfoExtractor):
2414         """Information extractor for depositfiles.com"""
2415
2416         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2417
2418         def __init__(self, downloader=None):
2419                 InfoExtractor.__init__(self, downloader)
2420
2421         @staticmethod
2422         def suitable(url):
2423                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2424
2425         def report_download_webpage(self, file_id):
2426                 """Report webpage download."""
2427                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2428
2429         def report_extraction(self, file_id):
2430                 """Report information extraction."""
2431                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2432
2433         def _real_initialize(self):
2434                 return
2435
2436         def _real_extract(self, url):
2437                 # At this point we have a new file
2438                 self._downloader.increment_downloads()
2439
2440                 file_id = url.split('/')[-1]
2441                 # Rebuild url in english locale
2442                 url = 'http://depositfiles.com/en/files/' + file_id
2443
2444                 # Retrieve file webpage with 'Free download' button pressed
2445                 free_download_indication = { 'gateway_result' : '1' }
2446                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2447                 try:
2448                         self.report_download_webpage(file_id)
2449                         webpage = urllib2.urlopen(request).read()
2450                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2451                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2452                         return
2453
2454                 # Search for the real file URL
2455                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2456                 if (mobj is None) or (mobj.group(1) is None):
2457                         # Try to figure out reason of the error.
2458                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2459                         if (mobj is not None) and (mobj.group(1) is not None):
2460                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2461                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2462                         else:
2463                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2464                         return
2465
2466                 file_url = mobj.group(1)
2467                 file_extension = os.path.splitext(file_url)[1][1:]
2468
2469                 # Search for file title
2470                 mobj = re.search(r'<b title="(.*?)">', webpage)
2471                 if mobj is None:
2472                         self._downloader.trouble(u'ERROR: unable to extract title')
2473                         return
2474                 file_title = mobj.group(1).decode('utf-8')
2475
2476                 try:
2477                         # Process file information
2478                         self._downloader.process_info({
2479                                 'id':           file_id.decode('utf-8'),
2480                                 'url':          file_url.decode('utf-8'),
2481                                 'uploader':     u'NA',
2482                                 'upload_date':  u'NA',
2483                                 'title':        file_title,
2484                                 'stitle':       file_title,
2485                                 'ext':          file_extension.decode('utf-8'),
2486                                 'format':       u'NA',
2487                                 'player_url':   None,
2488                         })
2489                 except UnavailableVideoError, err:
2490                         self._downloader.trouble(u'ERROR: unable to download file')
2491
2492 class FacebookIE(InfoExtractor):
2493         """Information Extractor for Facebook"""
2494
2495         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2496         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2497         _NETRC_MACHINE = 'facebook'
2498         _available_formats = ['highqual', 'lowqual']
2499         _video_extensions = {
2500                 'highqual': 'mp4',
2501                 'lowqual': 'mp4',
2502         }
2503
2504         def __init__(self, downloader=None):
2505                 InfoExtractor.__init__(self, downloader)
2506
2507         @staticmethod
2508         def suitable(url):
2509                 return (re.match(FacebookIE._VALID_URL, url) is not None)
2510
2511         def _reporter(self, message):
2512                 """Add header and report message."""
2513                 self._downloader.to_screen(u'[facebook] %s' % message)
2514
2515         def report_login(self):
2516                 """Report attempt to log in."""
2517                 self._reporter(u'Logging in')
2518
2519         def report_video_webpage_download(self, video_id):
2520                 """Report attempt to download video webpage."""
2521                 self._reporter(u'%s: Downloading video webpage' % video_id)
2522
2523         def report_information_extraction(self, video_id):
2524                 """Report attempt to extract video information."""
2525                 self._reporter(u'%s: Extracting video information' % video_id)
2526
2527         def _parse_page(self, video_webpage):
2528                 """Extract video information from page"""
2529                 # General data
2530                 data = {'title': r'class="video_title datawrap">(.*?)</',
2531                         'description': r'<div class="datawrap">(.*?)</div>',
2532                         'owner': r'\("video_owner_name", "(.*?)"\)',
2533                         'upload_date': r'data-date="(.*?)"',
2534                         'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2535                         }
2536                 video_info = {}
2537                 for piece in data.keys():
2538                         mobj = re.search(data[piece], video_webpage)
2539                         if mobj is not None:
2540                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2541
2542                 # Video urls
2543                 video_urls = {}
2544                 for fmt in self._available_formats:
2545                         mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2546                         if mobj is not None:
2547                                 # URL is in a Javascript segment inside an escaped Unicode format within
2548                                 # the generally utf-8 page
2549                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2550                 video_info['video_urls'] = video_urls
2551
2552                 return video_info
2553
2554         def _real_initialize(self):
2555                 if self._downloader is None:
2556                         return
2557
2558                 useremail = None
2559                 password = None
2560                 downloader_params = self._downloader.params
2561
2562                 # Attempt to use provided username and password or .netrc data
2563                 if downloader_params.get('username', None) is not None:
2564                         useremail = downloader_params['username']
2565                         password = downloader_params['password']
2566                 elif downloader_params.get('usenetrc', False):
2567                         try:
2568                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2569                                 if info is not None:
2570                                         useremail = info[0]
2571                                         password = info[2]
2572                                 else:
2573                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2574                         except (IOError, netrc.NetrcParseError), err:
2575                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2576                                 return
2577
2578                 if useremail is None:
2579                         return
2580
2581                 # Log in
2582                 login_form = {
2583                         'email': useremail,
2584                         'pass': password,
2585                         'login': 'Log+In'
2586                         }
2587                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2588                 try:
2589                         self.report_login()
2590                         login_results = urllib2.urlopen(request).read()
2591                         if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2592                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2593                                 return
2594                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2595                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2596                         return
2597
2598         def _real_extract(self, url):
2599                 mobj = re.match(self._VALID_URL, url)
2600                 if mobj is None:
2601                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2602                         return
2603                 video_id = mobj.group('ID')
2604
2605                 # Get video webpage
2606                 self.report_video_webpage_download(video_id)
2607                 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2608                 try:
2609                         page = urllib2.urlopen(request)
2610                         video_webpage = page.read()
2611                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2612                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2613                         return
2614
2615                 # Start extracting information
2616                 self.report_information_extraction(video_id)
2617
2618                 # Extract information
2619                 video_info = self._parse_page(video_webpage)
2620
2621                 # uploader
2622                 if 'owner' not in video_info:
2623                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2624                         return
2625                 video_uploader = video_info['owner']
2626
2627                 # title
2628                 if 'title' not in video_info:
2629                         self._downloader.trouble(u'ERROR: unable to extract video title')
2630                         return
2631                 video_title = video_info['title']
2632                 video_title = video_title.decode('utf-8')
2633                 video_title = sanitize_title(video_title)
2634
2635                 # simplified title
2636                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2637                 simple_title = simple_title.strip(ur'_')
2638
2639                 # thumbnail image
2640                 if 'thumbnail' not in video_info:
2641                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2642                         video_thumbnail = ''
2643                 else:
2644                         video_thumbnail = video_info['thumbnail']
2645
2646                 # upload date
2647                 upload_date = u'NA'
2648                 if 'upload_date' in video_info:
2649                         upload_time = video_info['upload_date']
2650                         timetuple = email.utils.parsedate_tz(upload_time)
2651                         if timetuple is not None:
2652                                 try:
2653                                         upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2654                                 except:
2655                                         pass
2656
2657                 # description
2658                 video_description = video_info.get('description', 'No description available.')
2659
2660                 url_map = video_info['video_urls']
2661                 if len(url_map.keys()) > 0:
2662                         # Decide which formats to download
2663                         req_format = self._downloader.params.get('format', None)
2664                         format_limit = self._downloader.params.get('format_limit', None)
2665
2666                         if format_limit is not None and format_limit in self._available_formats:
2667                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
2668                         else:
2669                                 format_list = self._available_formats
2670                         existing_formats = [x for x in format_list if x in url_map]
2671                         if len(existing_formats) == 0:
2672                                 self._downloader.trouble(u'ERROR: no known formats available for video')
2673                                 return
2674                         if req_format is None:
2675                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2676                         elif req_format == '-1':
2677                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2678                         else:
2679                                 # Specific format
2680                                 if req_format not in url_map:
2681                                         self._downloader.trouble(u'ERROR: requested format not available')
2682                                         return
2683                                 video_url_list = [(req_format, url_map[req_format])] # Specific format
2684
2685                 for format_param, video_real_url in video_url_list:
2686
2687                         # At this point we have a new video
2688                         self._downloader.increment_downloads()
2689
2690                         # Extension
2691                         video_extension = self._video_extensions.get(format_param, 'mp4')
2692
2693                         # Find the video URL in fmt_url_map or conn paramters
2694                         try:
2695                                 # Process video information
2696                                 self._downloader.process_info({
2697                                         'id':           video_id.decode('utf-8'),
2698                                         'url':          video_real_url.decode('utf-8'),
2699                                         'uploader':     video_uploader.decode('utf-8'),
2700                                         'upload_date':  upload_date,
2701                                         'title':        video_title,
2702                                         'stitle':       simple_title,
2703                                         'ext':          video_extension.decode('utf-8'),
2704                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
2705                                         'thumbnail':    video_thumbnail.decode('utf-8'),
2706                                         'description':  video_description.decode('utf-8'),
2707                                         'player_url':   None,
2708                                 })
2709                         except UnavailableVideoError, err:
2710                                 self._downloader.trouble(u'\nERROR: unable to download video')
2711
2712 class BlipTVIE(InfoExtractor):
2713         """Information extractor for blip.tv"""
2714
2715         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv(/.+)$'
2716         _URL_EXT = r'^.*\.([a-z0-9]+)$'
2717
2718         @staticmethod
2719         def suitable(url):
2720                 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2721
2722         def report_extraction(self, file_id):
2723                 """Report information extraction."""
2724                 self._downloader.to_screen(u'[blip.tv] %s: Extracting information' % file_id)
2725
2726         def _simplify_title(self, title):
2727                 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2728                 res = res.strip(ur'_')
2729                 return res
2730
2731         def _real_extract(self, url):
2732                 mobj = re.match(self._VALID_URL, url)
2733                 if mobj is None:
2734                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2735                         return
2736
2737                 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2738                 request = urllib2.Request(json_url)
2739                 self.report_extraction(mobj.group(1))
2740                 try:
2741                         json_code = urllib2.urlopen(request).read()
2742                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2743                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2744                         return
2745                 try:
2746                         json_data = json.loads(json_code)
2747                         data = json_data['Post'] if 'Post' in json_data else json_data
2748
2749                         upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2750                         video_url = data['media']['url']
2751                         umobj = re.match(self._URL_EXT, video_url)
2752                         if umobj is None:
2753                                 raise ValueError('Can not determine filename extension')
2754                         ext = umobj.group(1)
2755
2756                         self._downloader.increment_downloads()
2757
2758                         info = {
2759                                 'id': data['item_id'],
2760                                 'url': video_url,
2761                                 'uploader': data['display_name'],
2762                                 'upload_date': upload_date,
2763                                 'title': data['title'],
2764                                 'stitle': self._simplify_title(data['title']),
2765                                 'ext': ext,
2766                                 'format': data['media']['mimeType'],
2767                                 'thumbnail': data['thumbnailUrl'],
2768                                 'description': data['description'],
2769                                 'player_url': data['embedUrl']
2770                         }
2771                 except (ValueError,KeyError), err:
2772                         self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2773                         return
2774
2775                 try:
2776                         self._downloader.process_info(info)
2777                 except UnavailableVideoError, err:
2778                         self._downloader.trouble(u'\nERROR: unable to download video')
2779
2780
2781 class PostProcessor(object):
2782         """Post Processor class.
2783
2784         PostProcessor objects can be added to downloaders with their
2785         add_post_processor() method. When the downloader has finished a
2786         successful download, it will take its internal chain of PostProcessors
2787         and start calling the run() method on each one of them, first with
2788         an initial argument and then with the returned value of the previous
2789         PostProcessor.
2790
2791         The chain will be stopped if one of them ever returns None or the end
2792         of the chain is reached.
2793
2794         PostProcessor objects follow a "mutual registration" process similar
2795         to InfoExtractor objects.
2796         """
2797
2798         _downloader = None
2799
2800         def __init__(self, downloader=None):
2801                 self._downloader = downloader
2802
2803         def set_downloader(self, downloader):
2804                 """Sets the downloader for this PP."""
2805                 self._downloader = downloader
2806
2807         def run(self, information):
2808                 """Run the PostProcessor.
2809
2810                 The "information" argument is a dictionary like the ones
2811                 composed by InfoExtractors. The only difference is that this
2812                 one has an extra field called "filepath" that points to the
2813                 downloaded file.
2814
2815                 When this method returns None, the postprocessing chain is
2816                 stopped. However, this method may return an information
2817                 dictionary that will be passed to the next postprocessing
2818                 object in the chain. It can be the one it received after
2819                 changing some fields.
2820
2821                 In addition, this method may raise a PostProcessingError
2822                 exception that will be taken into account by the downloader
2823                 it was called from.
2824                 """
2825                 return information # by default, do nothing
2826
2827 class FFmpegExtractAudioPP(PostProcessor):
2828
2829         def __init__(self, downloader=None, preferredcodec=None):
2830                 PostProcessor.__init__(self, downloader)
2831                 if preferredcodec is None:
2832                         preferredcodec = 'best'
2833                 self._preferredcodec = preferredcodec
2834
2835         @staticmethod
2836         def get_audio_codec(path):
2837                 try:
2838                         cmd = ['ffprobe', '-show_streams', '--', path]
2839                         handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2840                         output = handle.communicate()[0]
2841                         if handle.wait() != 0:
2842                                 return None
2843                 except (IOError, OSError):
2844                         return None
2845                 audio_codec = None
2846                 for line in output.split('\n'):
2847                         if line.startswith('codec_name='):
2848                                 audio_codec = line.split('=')[1].strip()
2849                         elif line.strip() == 'codec_type=audio' and audio_codec is not None:
2850                                 return audio_codec
2851                 return None
2852
2853         @staticmethod
2854         def run_ffmpeg(path, out_path, codec, more_opts):
2855                 try:
2856                         cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2857                         ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2858                         return (ret == 0)
2859                 except (IOError, OSError):
2860                         return False
2861
2862         def run(self, information):
2863                 path = information['filepath']
2864
2865                 filecodec = self.get_audio_codec(path)
2866                 if filecodec is None:
2867                         self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2868                         return None
2869
2870                 more_opts = []
2871                 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2872                         if filecodec == 'aac' or filecodec == 'mp3':
2873                                 # Lossless if possible
2874                                 acodec = 'copy'
2875                                 extension = filecodec
2876                                 if filecodec == 'aac':
2877                                         more_opts = ['-f', 'adts']
2878                         else:
2879                                 # MP3 otherwise.
2880                                 acodec = 'libmp3lame'
2881                                 extension = 'mp3'
2882                                 more_opts = ['-ab', '128k']
2883                 else:
2884                         # We convert the audio (lossy)
2885                         acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2886                         extension = self._preferredcodec
2887                         more_opts = ['-ab', '128k']
2888                         if self._preferredcodec == 'aac':
2889                                 more_opts += ['-f', 'adts']
2890
2891                 (prefix, ext) = os.path.splitext(path)
2892                 new_path = prefix + '.' + extension
2893                 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2894                 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2895
2896                 if not status:
2897                         self._downloader.to_stderr(u'WARNING: error running ffmpeg')
2898                         return None
2899
2900                 try:
2901                         os.remove(path)
2902                 except (IOError, OSError):
2903                         self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
2904                         return None
2905
2906                 information['filepath'] = new_path
2907                 return information
2908
2909 ### MAIN PROGRAM ###
2910 if __name__ == '__main__':
2911         try:
2912                 # Modules needed only when running the main program
2913                 import getpass
2914                 import optparse
2915
2916                 # Function to update the program file with the latest version from the repository.
2917                 def update_self(downloader, filename):
2918                         # Note: downloader only used for options
2919                         if not os.access(filename, os.W_OK):
2920                                 sys.exit('ERROR: no write permissions on %s' % filename)
2921
2922                         downloader.to_screen('Updating to latest stable version...')
2923                         try:
2924                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2925                                 latest_version = urllib.urlopen(latest_url).read().strip()
2926                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2927                                 newcontent = urllib.urlopen(prog_url).read()
2928                         except (IOError, OSError), err:
2929                                 sys.exit('ERROR: unable to download latest version')
2930                         try:
2931                                 stream = open(filename, 'w')
2932                                 stream.write(newcontent)
2933                                 stream.close()
2934                         except (IOError, OSError), err:
2935                                 sys.exit('ERROR: unable to overwrite current version')
2936                         downloader.to_screen('Updated to version %s' % latest_version)
2937
2938                 # Parse command line
2939                 parser = optparse.OptionParser(
2940                         usage='Usage: %prog [options] url...',
2941                         version='2011.07.09-phihag',
2942                         conflict_handler='resolve',
2943                 )
2944
2945                 parser.add_option('-h', '--help',
2946                                 action='help', help='print this help text and exit')
2947                 parser.add_option('-v', '--version',
2948                                 action='version', help='print program version and exit')
2949                 parser.add_option('-U', '--update',
2950                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2951                 parser.add_option('-i', '--ignore-errors',
2952                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2953                 parser.add_option('-r', '--rate-limit',
2954                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2955                 parser.add_option('-R', '--retries',
2956                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2957                 parser.add_option('--playlist-start',
2958                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2959                 parser.add_option('--playlist-end',
2960                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2961                 parser.add_option('--dump-user-agent',
2962                                 action='store_true', dest='dump_user_agent',
2963                                 help='display the current browser identification', default=False)
2964
2965                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2966                 authentication.add_option('-u', '--username',
2967                                 dest='username', metavar='USERNAME', help='account username')
2968                 authentication.add_option('-p', '--password',
2969                                 dest='password', metavar='PASSWORD', help='account password')
2970                 authentication.add_option('-n', '--netrc',
2971                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2972                 parser.add_option_group(authentication)
2973
2974                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2975                 video_format.add_option('-f', '--format',
2976                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2977                 video_format.add_option('--all-formats',
2978                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2979                 video_format.add_option('--max-quality',
2980                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2981                 parser.add_option_group(video_format)
2982
2983                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2984                 verbosity.add_option('-q', '--quiet',
2985                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2986                 verbosity.add_option('-s', '--simulate',
2987                                 action='store_true', dest='simulate', help='do not download video', default=False)
2988                 verbosity.add_option('-g', '--get-url',
2989                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2990                 verbosity.add_option('-e', '--get-title',
2991                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2992                 verbosity.add_option('--get-thumbnail',
2993                                 action='store_true', dest='getthumbnail',
2994                                 help='simulate, quiet but print thumbnail URL', default=False)
2995                 verbosity.add_option('--get-description',
2996                                 action='store_true', dest='getdescription',
2997                                 help='simulate, quiet but print video description', default=False)
2998                 verbosity.add_option('--get-filename',
2999                                 action='store_true', dest='getfilename',
3000                                 help='simulate, quiet but print output filename', default=False)
3001                 verbosity.add_option('--no-progress',
3002                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3003                 verbosity.add_option('--console-title',
3004                                 action='store_true', dest='consoletitle',
3005                                 help='display progress in console titlebar', default=False)
3006                 parser.add_option_group(verbosity)
3007
3008                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3009                 filesystem.add_option('-t', '--title',
3010                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
3011                 filesystem.add_option('-l', '--literal',
3012                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3013                 filesystem.add_option('-A', '--auto-number',
3014                                 action='store_true', dest='autonumber',
3015                                 help='number downloaded files starting from 00000', default=False)
3016                 filesystem.add_option('-o', '--output',
3017                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
3018                 filesystem.add_option('-a', '--batch-file',
3019                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3020                 filesystem.add_option('-w', '--no-overwrites',
3021                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3022                 filesystem.add_option('-c', '--continue',
3023                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3024                 filesystem.add_option('--cookies',
3025                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
3026                 filesystem.add_option('--no-part',
3027                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
3028                 filesystem.add_option('--no-mtime',
3029                                 action='store_false', dest='updatetime',
3030                                 help='do not use the Last-modified header to set the file modification time', default=True)
3031                 filesystem.add_option('--write-description',
3032                                 action='store_true', dest='writedescription',
3033                                 help='write video description to a .description file', default=False)
3034                 parser.add_option_group(filesystem)
3035
3036                 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3037                 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3038                                 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3039                 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3040                                 help='"best", "aac" or "mp3"; best by default')
3041                 parser.add_option_group(postproc)
3042
3043                 (opts, args) = parser.parse_args()
3044
3045                 # Open appropriate CookieJar
3046                 if opts.cookiefile is None:
3047                         jar = cookielib.CookieJar()
3048                 else:
3049                         try:
3050                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
3051                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
3052                                         jar.load()
3053                         except (IOError, OSError), err:
3054                                 sys.exit(u'ERROR: unable to open cookie file')
3055
3056                 # Dump user agent
3057                 if opts.dump_user_agent:
3058                         print std_headers['User-Agent']
3059                         sys.exit(0)
3060
3061                 # General configuration
3062                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
3063                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
3064                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3065
3066                 # Batch file verification
3067                 batchurls = []
3068                 if opts.batchfile is not None:
3069                         try:
3070                                 if opts.batchfile == '-':
3071                                         batchfd = sys.stdin
3072                                 else:
3073                                         batchfd = open(opts.batchfile, 'r')
3074                                 batchurls = batchfd.readlines()
3075                                 batchurls = [x.strip() for x in batchurls]
3076                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
3077                         except IOError:
3078                                 sys.exit(u'ERROR: batch file could not be read')
3079                 all_urls = batchurls + args
3080
3081                 # Conflicting, missing and erroneous options
3082                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
3083                         parser.error(u'using .netrc conflicts with giving username/password')
3084                 if opts.password is not None and opts.username is None:
3085                         parser.error(u'account username missing')
3086                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
3087                         parser.error(u'using output template conflicts with using title, literal title or auto number')
3088                 if opts.usetitle and opts.useliteral:
3089                         parser.error(u'using title conflicts with using literal title')
3090                 if opts.username is not None and opts.password is None:
3091                         opts.password = getpass.getpass(u'Type account password and press return:')
3092                 if opts.ratelimit is not None:
3093                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
3094                         if numeric_limit is None:
3095                                 parser.error(u'invalid rate limit specified')
3096                         opts.ratelimit = numeric_limit
3097                 if opts.retries is not None:
3098                         try:
3099                                 opts.retries = long(opts.retries)
3100                         except (TypeError, ValueError), err:
3101                                 parser.error(u'invalid retry count specified')
3102                 try:
3103                         opts.playliststart = long(opts.playliststart)
3104                         if opts.playliststart <= 0:
3105                                 raise ValueError
3106                 except (TypeError, ValueError), err:
3107                         parser.error(u'invalid playlist start number specified')
3108                 try:
3109                         opts.playlistend = long(opts.playlistend)
3110                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
3111                                 raise ValueError
3112                 except (TypeError, ValueError), err:
3113                         parser.error(u'invalid playlist end number specified')
3114                 if opts.extractaudio:
3115                         if opts.audioformat not in ['best', 'aac', 'mp3']:
3116                                 parser.error(u'invalid audio format specified')
3117
3118                 # Information extractors
3119                 youtube_ie = YoutubeIE()
3120                 metacafe_ie = MetacafeIE(youtube_ie)
3121                 dailymotion_ie = DailymotionIE()
3122                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
3123                 youtube_user_ie = YoutubeUserIE(youtube_ie)
3124                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
3125                 google_ie = GoogleIE()
3126                 google_search_ie = GoogleSearchIE(google_ie)
3127                 photobucket_ie = PhotobucketIE()
3128                 yahoo_ie = YahooIE()
3129                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
3130                 deposit_files_ie = DepositFilesIE()
3131                 facebook_ie = FacebookIE()
3132                 bliptv_ie = BlipTVIE()
3133                 generic_ie = GenericIE()
3134
3135                 # File downloader
3136                 fd = FileDownloader({
3137                         'usenetrc': opts.usenetrc,
3138                         'username': opts.username,
3139                         'password': opts.password,
3140                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3141                         'forceurl': opts.geturl,
3142                         'forcetitle': opts.gettitle,
3143                         'forcethumbnail': opts.getthumbnail,
3144                         'forcedescription': opts.getdescription,
3145                         'forcefilename': opts.getfilename,
3146                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
3147                         'format': opts.format,
3148                         'format_limit': opts.format_limit,
3149                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
3150                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3151                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
3152                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
3153                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3154                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3155                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
3156                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
3157                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
3158                                 or u'%(id)s.%(ext)s'),
3159                         'ignoreerrors': opts.ignoreerrors,
3160                         'ratelimit': opts.ratelimit,
3161                         'nooverwrites': opts.nooverwrites,
3162                         'retries': opts.retries,
3163                         'continuedl': opts.continue_dl,
3164                         'noprogress': opts.noprogress,
3165                         'playliststart': opts.playliststart,
3166                         'playlistend': opts.playlistend,
3167                         'logtostderr': opts.outtmpl == '-',
3168                         'consoletitle': opts.consoletitle,
3169                         'nopart': opts.nopart,
3170                         'updatetime': opts.updatetime,
3171                         'writedescription': opts.writedescription,
3172                         })
3173                 fd.add_info_extractor(youtube_search_ie)
3174                 fd.add_info_extractor(youtube_pl_ie)
3175                 fd.add_info_extractor(youtube_user_ie)
3176                 fd.add_info_extractor(metacafe_ie)
3177                 fd.add_info_extractor(dailymotion_ie)
3178                 fd.add_info_extractor(youtube_ie)
3179                 fd.add_info_extractor(google_ie)
3180                 fd.add_info_extractor(google_search_ie)
3181                 fd.add_info_extractor(photobucket_ie)
3182                 fd.add_info_extractor(yahoo_ie)
3183                 fd.add_info_extractor(yahoo_search_ie)
3184                 fd.add_info_extractor(deposit_files_ie)
3185                 fd.add_info_extractor(facebook_ie)
3186                 fd.add_info_extractor(bliptv_ie)
3187
3188                 # This must come last since it's the
3189                 # fallback if none of the others work
3190                 fd.add_info_extractor(generic_ie)
3191
3192                 # PostProcessors
3193                 if opts.extractaudio:
3194                         fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
3195
3196                 # Update version
3197                 if opts.update_self:
3198                         update_self(fd, sys.argv[0])
3199
3200                 # Maybe do nothing
3201                 if len(all_urls) < 1:
3202                         if not opts.update_self:
3203                                 parser.error(u'you must provide at least one URL')
3204                         else:
3205                                 sys.exit()
3206                 retcode = fd.download(all_urls)
3207
3208                 # Dump cookie jar if requested
3209                 if opts.cookiefile is not None:
3210                         try:
3211                                 jar.save()
3212                         except (IOError, OSError), err:
3213                                 sys.exit(u'ERROR: unable to save cookie jar')
3214
3215                 sys.exit(retcode)
3216
3217         except DownloadError:
3218                 sys.exit(1)
3219         except SameFileError:
3220                 sys.exit(u'ERROR: fixed output name but more than one file to download')
3221         except KeyboardInterrupt:
3222                 sys.exit(u'\nERROR: Interrupted by user')