Wrap call to addinfourl for compatibility with Python 2.4
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import ctypes
10 import datetime
11 import gzip
12 import htmlentitydefs
13 import httplib
14 import locale
15 import math
16 import netrc
17 import os
18 import os.path
19 import re
20 import socket
21 import string
22 import StringIO
23 import subprocess
24 import sys
25 import time
26 import urllib
27 import urllib2
28 import zlib
29
30 # parse_qs was moved from the cgi module to the urlparse module recently.
31 try:
32         from urlparse import parse_qs
33 except ImportError:
34         from cgi import parse_qs
35
# HTTP headers sent with every request, imitating a contemporary browser so
# servers do not reject or degrade responses for an unknown client.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed in "simple" titles: ASCII letters and digits only.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
45
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	locale-reported encoding is unusable (unknown codec name, no locale
	configured), fall back to UTF-8.
	"""
	# The original implementation wrapped this in a one-shot generator for
	# no benefit; computing the value directly is equivalent.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported name maps to a real codec; encoding raises
		# LookupError (caught below) if it does not.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
61
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function. Unknown entities are returned literally.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference: decimal (&#39;) or hexadecimal (&#x27;).
	# The hexadecimal alternative must accept the digits a-f; the previous
	# pattern x?\d+ silently truncated entities such as &#x1F; (matching
	# only "x1") and rejected purely alphabetic hex values like &#xAB;.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
87
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities first, then neutralize path separators so the
	# title cannot escape into other directories.
	decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
	return decoded.replace(unicode(os.sep), u'%')
92
93 def sanitize_open(filename, open_mode):
94         """Try to open the given filename, and slightly tweak it if this fails.
95
96         Attempts to open the given filename. If this fails, it tries to change
97         the filename slightly, step by step, until it's either able to open it
98         or it fails and raises a final exception, like the standard open()
99         function.
100
101         It returns the tuple (stream, definitive_file_name).
102         """
103         try:
104                 if filename == u'-':
105                         if sys.platform == 'win32':
106                                 import msvcrt
107                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
108                         return (sys.stdout, filename)
109                 stream = open(filename, open_mode)
110                 return (stream, filename)
111         except (IOError, OSError), err:
112                 # In case of error, try to remove win32 forbidden chars
113                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
114
115                 # An exception here should be caught in the caller
116                 stream = open(filename, open_mode)
117                 return (stream, filename)
118
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they are not configured to
	ignore errors; carries the relevant error message.
	"""
	pass
127
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple files
	would have to be written to the same destination on disk.
	"""
	pass
135
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
143
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that is not available
	for that video.
	"""
	pass
151
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out
	smaller than the size the server announced, which usually means the
	connection was interrupted.
	"""
	# Byte counts: what was actually received vs. what was announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
166
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	  http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""

	@staticmethod
	def deflate(data):
		"""Decompress a deflate-encoded body.

		Try a raw deflate stream first (some servers omit the zlib
		header); fall back to a regular zlib-wrapped stream.
		"""
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		"""Build an addinfourl response, tolerating Python 2.4.

		Python 2.4's addinfourl constructor does not accept the response
		code argument (nor does the object expose getcode()), so the code
		is only passed when the running interpreter supports it.
		"""
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		return urllib2.addinfourl(stream, headers, url)

	def http_request(self, req):
		"""Install std_headers on the outgoing request, replacing duplicates."""
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Honour the internal no-compression marker: strip it, together
		# with the Accept-encoding header, before the request goes out.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		"""Transparently decompress gzip- or deflate-encoded responses."""
		old_resp = resp
		# gzip
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
222
223 class FileDownloader(object):
224         """File Downloader class.
225
226         File downloader objects are the ones responsible of downloading the
227         actual video file and writing it to disk if the user has requested
228         it, among some other tasks. In most cases there should be one per
229         program. As, given a video URL, the downloader doesn't know how to
230         extract all the needed information, task that InfoExtractors do, it
231         has to pass the URL to one of them.
232
233         For this, file downloader objects have a method that allows
234         InfoExtractors to be registered in a given order. When it is passed
235         a URL, the file downloader handles it to the first InfoExtractor it
236         finds that reports being able to handle it. The InfoExtractor extracts
237         all the information about the video or videos the URL refers to, and
238         asks the FileDownloader to process the video information, possibly
239         downloading the video.
240
241         File downloaders accept a lot of parameters. In order not to saturate
242         the object constructor with arguments, it receives a dictionary of
243         options instead. These options are available through the params
244         attribute for the InfoExtractors to use. The FileDownloader also
245         registers itself as the downloader in charge for the InfoExtractors
246         that are added to it, so this is a "mutual registration".
247
248         Available options:
249
250         username:         Username for authentication purposes.
251         password:         Password for authentication purposes.
252         usenetrc:         Use netrc for authentication instead.
253         quiet:            Do not print messages to stdout.
254         forceurl:         Force printing final URL.
255         forcetitle:       Force printing title.
256         forcethumbnail:   Force printing thumbnail URL.
257         forcedescription: Force printing description.
258         simulate:         Do not download the video files.
259         format:           Video format code.
260         format_limit:     Highest quality format to try.
261         outtmpl:          Template for output names.
262         ignoreerrors:     Do not stop on download errors.
263         ratelimit:        Download speed limit, in bytes/sec.
264         nooverwrites:     Prevent overwriting files.
265         retries:          Number of times to retry for HTTP error 5xx
266         continuedl:       Try to continue downloads if possible.
267         noprogress:       Do not print the progress bar.
268         playliststart:    Playlist item to start at.
269         playlistend:      Playlist item to end at.
270         logtostderr:      Log messages to stderr instead of stdout.
271         consoletitle:     Display progress in console window's titlebar.
272         nopart:           Do not use temporary .part files.
273         """
274
	params = None  # Options dictionary passed to the constructor
	_ies = []  # Registered InfoExtractor objects, tried in order
	_pps = []  # Registered PostProcessor objects, run as a chain
	_download_retcode = None  # Process return code reported by download()
	_num_downloads = None  # Ordinal of the current download (for %(autonumber)s)
	_screen_file = None  # Stream for status messages (stdout or stderr)
281
282         def __init__(self, params):
283                 """Create a FileDownloader object with the given options."""
284                 self._ies = []
285                 self._pps = []
286                 self._download_retcode = 0
287                 self._num_downloads = 0
288                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
289                 self.params = params
290
291         @staticmethod
292         def pmkdir(filename):
293                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
294                 components = filename.split(os.sep)
295                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
296                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
297                 for dir in aggregate:
298                         if not os.path.exists(dir):
299                                 os.mkdir(dir)
300
301         @staticmethod
302         def format_bytes(bytes):
303                 if bytes is None:
304                         return 'N/A'
305                 if type(bytes) is str:
306                         bytes = float(bytes)
307                 if bytes == 0.0:
308                         exponent = 0
309                 else:
310                         exponent = long(math.log(bytes, 1024.0))
311                 suffix = 'bkMGTPEZY'[exponent]
312                 converted = float(bytes) / float(1024**exponent)
313                 return '%.2f%s' % (converted, suffix)
314
315         @staticmethod
316         def calc_percent(byte_counter, data_len):
317                 if data_len is None:
318                         return '---.-%'
319                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
320
321         @staticmethod
322         def calc_eta(start, now, total, current):
323                 if total is None:
324                         return '--:--'
325                 dif = now - start
326                 if current == 0 or dif < 0.001: # One millisecond
327                         return '--:--'
328                 rate = float(current) / dif
329                 eta = long((float(total) - float(current)) / rate)
330                 (eta_mins, eta_secs) = divmod(eta, 60)
331                 if eta_mins > 99:
332                         return '--:--'
333                 return '%02d:%02d' % (eta_mins, eta_secs)
334
335         @staticmethod
336         def calc_speed(start, now, bytes):
337                 dif = now - start
338                 if bytes == 0 or dif < 0.001: # One millisecond
339                         return '%10s' % '---b/s'
340                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
341
342         @staticmethod
343         def best_block_size(elapsed_time, bytes):
344                 new_min = max(bytes / 2.0, 1.0)
345                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
346                 if elapsed_time < 0.001:
347                         return long(new_max)
348                 rate = bytes / elapsed_time
349                 if rate > new_max:
350                         return long(new_max)
351                 if rate < new_min:
352                         return long(new_min)
353                 return long(rate)
354
355         @staticmethod
356         def parse_bytes(bytestr):
357                 """Parse a string indicating a byte quantity into a long integer."""
358                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
359                 if matchobj is None:
360                         return None
361                 number = float(matchobj.group(1))
362                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
363                 return long(round(number * multiplier))
364
365         def add_info_extractor(self, ie):
366                 """Add an InfoExtractor object to the end of the list."""
367                 self._ies.append(ie)
368                 ie.set_downloader(self)
369
370         def add_post_processor(self, pp):
371                 """Add a PostProcessor object to the end of the chain."""
372                 self._pps.append(pp)
373                 pp.set_downloader(self)
374
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# skip_eol=True suppresses the newline so progress lines
				# starting with '\r' can overwrite themselves in place.
				terminator = [u'\n', u''][skip_eol]
				# Trailing comma stops print from adding its own newline;
				# the chosen terminator controls line endings instead.
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
385
	def to_stderr(self, message):
		"""Print message to stderr, encoded for the current locale."""
		print >>sys.stderr, message.encode(preferredencoding())
389
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# OSC 0 escape sequence sets the window title on
			# xterm-compatible terminals.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
400
401         def fixed_template(self):
402                 """Checks if the output template is fixed."""
403                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
404
405         def trouble(self, message=None):
406                 """Determine action to take when a download problem appears.
407
408                 Depending on if the downloader has been configured to ignore
409                 download errors or not, this method may throw an exception or
410                 not when errors are found, after printing the message.
411                 """
412                 if message is not None:
413                         self.to_stderr(message)
414                 if not self.params.get('ignoreerrors', False):
415                         raise DownloadError(message)
416                 self._download_retcode = 1
417
418         def slow_down(self, start_time, byte_counter):
419                 """Sleep if the download speed is over the rate limit."""
420                 rate_limit = self.params.get('ratelimit', None)
421                 if rate_limit is None or byte_counter == 0:
422                         return
423                 now = time.time()
424                 elapsed = now - start_time
425                 if elapsed <= 0.0:
426                         return
427                 speed = float(byte_counter) / elapsed
428                 if speed > rate_limit:
429                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
430
431         def temp_name(self, filename):
432                 """Returns a temporary filename for the given filename."""
433                 if self.params.get('nopart', False) or filename == u'-' or \
434                                 (os.path.exists(filename) and not os.path.isfile(filename)):
435                         return filename
436                 return filename + u'.part'
437
438         def undo_temp_name(self, filename):
439                 if filename.endswith(u'.part'):
440                         return filename[:-len(u'.part')]
441                 return filename
442
443         def try_rename(self, old_filename, new_filename):
444                 try:
445                         if old_filename == new_filename:
446                                 return
447                         os.rename(old_filename, new_filename)
448                 except (IOError, OSError), err:
449                         self.trouble(u'ERROR: unable to rename file')
450
451         def report_destination(self, filename):
452                 """Report destination filename."""
453                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
454
455         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
456                 """Report download progress."""
457                 if self.params.get('noprogress', False):
458                         return
459                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
460                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
461                 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
462                                 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
463
464         def report_resuming_byte(self, resume_len):
465                 """Report attempt to resume at given byte."""
466                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
467
468         def report_retry(self, count, retries):
469                 """Report retry in case of HTTP error 5xx"""
470                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
471
472         def report_file_already_downloaded(self, file_name):
473                 """Report file has already been fully downloaded."""
474                 try:
475                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
476                 except (UnicodeEncodeError), err:
477                         self.to_screen(u'[download] The file has already been downloaded')
478
479         def report_unable_to_resume(self):
480                 """Report it was impossible to resume download."""
481                 self.to_screen(u'[download] Unable to resume')
482
483         def report_finish(self):
484                 """Report download finished."""
485                 if self.params.get('noprogress', False):
486                         self.to_screen(u'[download] Download completed')
487                 else:
488                         self.to_screen(u'')
489
490         def increment_downloads(self):
491                 """Increment the ordinal that assigns a number to each file."""
492                 self._num_downloads += 1
493
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only the forced printings happen; otherwise the
		output filename is built from the template, directories are
		created, the video data is downloaded and the postprocessing
		chain is run on the result.
		"""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# Extra template fields: current time and the per-run counter.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local filesystem trouble is reported as the video being
			# unavailable in the requested format.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
545
546         def download(self, url_list):
547                 """Download a given list of URLs."""
548                 if len(url_list) > 1 and self.fixed_template():
549                         raise SameFileError(self.params['outtmpl'])
550
551                 for url in url_list:
552                         suitable_found = False
553                         for ie in self._ies:
554                                 # Go to next InfoExtractor if not suitable
555                                 if not ie.suitable(url):
556                                         continue
557
558                                 # Suitable InfoExtractor found
559                                 suitable_found = True
560
561                                 # Extract information from URL and process it
562                                 ie.extract(url)
563
564                                 # Suitable InfoExtractor had been found; go to next URL
565                                 break
566
567                         if not suitable_found:
568                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
569
570                 return self._download_retcode
571
572         def post_process(self, filename, ie_info):
573                 """Run the postprocessing chain on the given file."""
574                 info = dict(ie_info)
575                 info['filepath'] = filename
576                 for pp in self._pps:
577                         info = pp.run(info)
578                         if info is None:
579                                 break
580
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False on failure (after reporting via
		self.trouble). Resumes interrupted transfers when rtmpdump's exit
		code indicates that is possible.
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No bytes were added by this resume attempt; give up
				# instead of looping forever.
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
612
613         def _do_download(self, filename, url, player_url):
614                 # Check file already present
615                 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
616                         self.report_file_already_downloaded(filename)
617                         return True
618
619                 # Attempt to download using rtmpdump
620                 if url.startswith('rtmp'):
621                         return self._download_with_rtmpdump(filename, url, player_url)
622
623                 tmpfilename = self.temp_name(filename)
624                 stream = None
625                 open_mode = 'wb'
626
627                 # Do not include the Accept-Encoding header
628                 headers = {'Youtubedl-no-compression': 'True'}
629                 basic_request = urllib2.Request(url, None, headers)
630                 request = urllib2.Request(url, None, headers)
631
632                 # Establish possible resume length
633                 if os.path.isfile(tmpfilename):
634                         resume_len = os.path.getsize(tmpfilename)
635                 else:
636                         resume_len = 0
637
638                 # Request parameters in case of being able to resume
639                 if self.params.get('continuedl', False) and resume_len != 0:
640                         self.report_resuming_byte(resume_len)
641                         request.add_header('Range','bytes=%d-' % resume_len)
642                         open_mode = 'ab'
643
644                 count = 0
645                 retries = self.params.get('retries', 0)
646                 while count <= retries:
647                         # Establish connection
648                         try:
649                                 data = urllib2.urlopen(request)
650                                 break
651                         except (urllib2.HTTPError, ), err:
652                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
653                                         # Unexpected HTTP error
654                                         raise
655                                 elif err.code == 416:
656                                         # Unable to resume (requested range not satisfiable)
657                                         try:
658                                                 # Open the connection again without the range header
659                                                 data = urllib2.urlopen(basic_request)
660                                                 content_length = data.info()['Content-Length']
661                                         except (urllib2.HTTPError, ), err:
662                                                 if err.code < 500 or err.code >= 600:
663                                                         raise
664                                         else:
665                                                 # Examine the reported length
666                                                 if (content_length is not None and
667                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
668                                                         # The file had already been fully downloaded.
669                                                         # Explanation to the above condition: in issue #175 it was revealed that
670                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
671                                                         # changing the file size slightly and causing problems for some users. So
672                                                         # I decided to implement a suggested change and consider the file
673                                                         # completely downloaded if the file size differs less than 100 bytes from
674                                                         # the one in the hard drive.
675                                                         self.report_file_already_downloaded(filename)
676                                                         self.try_rename(tmpfilename, filename)
677                                                         return True
678                                                 else:
679                                                         # The length does not match, we start the download over
680                                                         self.report_unable_to_resume()
681                                                         open_mode = 'wb'
682                                                         break
683                         # Retry
684                         count += 1
685                         if count <= retries:
686                                 self.report_retry(count, retries)
687
688                 if count > retries:
689                         self.trouble(u'ERROR: giving up after %s retries' % retries)
690                         return False
691
692                 data_len = data.info().get('Content-length', None)
693                 if data_len is not None:
694                         data_len = long(data_len) + resume_len
695                 data_len_str = self.format_bytes(data_len)
696                 byte_counter = 0 + resume_len
697                 block_size = 1024
698                 start = time.time()
699                 while True:
700                         # Download and write
701                         before = time.time()
702                         data_block = data.read(block_size)
703                         after = time.time()
704                         if len(data_block) == 0:
705                                 break
706                         byte_counter += len(data_block)
707
708                         # Open file just in time
709                         if stream is None:
710                                 try:
711                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
712                                         filename = self.undo_temp_name(tmpfilename)
713                                         self.report_destination(filename)
714                                 except (OSError, IOError), err:
715                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
716                                         return False
717                         try:
718                                 stream.write(data_block)
719                         except (IOError, OSError), err:
720                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
721                                 return False
722                         block_size = self.best_block_size(after - before, len(data_block))
723
724                         # Progress message
725                         percent_str = self.calc_percent(byte_counter, data_len)
726                         eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
727                         speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
728                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
729
730                         # Apply rate limit
731                         self.slow_down(start, byte_counter - resume_len)
732
733                 stream.close()
734                 self.report_finish()
735                 if data_len is not None and byte_counter != data_len:
736                         raise ContentTooShortError(byte_counter, long(data_len))
737                 self.try_rename(tmpfilename, filename)
738                 return True
739
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and produces, for each video the
	URL refers to, a dictionary of metadata that is handed over to the
	FileDownloader (which may then download the video to the file system,
	print fields, or perform other outcomes). Each dictionary must carry
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only when their respective forced-printing
	functions are called (their primary purpose is to let youtube-dl act
	as the backend for a video search function, such as the one in
	youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors should override _real_initialize(),
	_real_extract() and the suitable() static method, and will usually be
	instantiated and registered with the main downloader.
	"""

	# Flipped to True by initialize() once _real_initialize() has run.
	_ready = False
	# Downloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
810
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 is the optional URL prefix; group 2 is the video id.  The
	# trailing (?(1).+)? is a conditional group: extra path/query text is
	# only permitted when the prefix (group 1) matched, so a bare id is
	# also accepted.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format codes to filename extensions; codes absent from this
	# dict fall back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set English language, optionally log in, and confirm age.

		Credentials come from the downloader's 'username'/'password'
		params or, with 'usenetrc', from the user's .netrc.  Language
		and login failures are reported as warnings and abort the rest
		of the initialization; a failed age confirmation is reported
		as an error.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract metadata for a YouTube URL and hand each selected
		format to the downloader via process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Un-escape the backslash-escaped URL found in the page's JS.
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info
		# Try several 'el' parameter variants until one response carries
		# a 'token'; the empty variant is the last resort.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		# Collapse every run of characters outside simple_title_chars
		# into a single underscore, then trim the ends.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		# NOTE(review): a missing thumbnail is reported through trouble()
		# even though the message is worded as a WARNING.
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# strptime raises ValueError when the expression does
					# not match; non-matching formats are simply skipped.
					pass

		# description
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			# Only scrape the description when the user asked for it.
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# Cap quality: only consider formats at or below the limit.
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				# Specific format
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					# Pre-ternary and/or idiom: u'NA' for RTMP entries
					# (format_param is None), decoded code otherwise.
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
1090
1091
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the URL's simplified-title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE delegate used for Metacafe pages that embed YouTube videos.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives a YoutubeIE to delegate 'yt-' ids to,
		and an optional downloader."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page, then POST the filter
		form to disable it (age confirmation)."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract metadata for a Metacafe URL and hand it to the
		downloader via process_info(). YouTube-hosted videos ('yt-'
		ids) are delegated to the YoutubeIE."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# NOTE(review): message reads "unable retrieve" (missing
			# "to"); left untouched here because it is runtime output.
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# Assumes the URL ends in a three-letter extension (e.g. flv).
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: dig the media URL out of the flashvars parameters.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			# (no 'thumbnail'/'description' here — those optional
			# fields are not extracted for Metacafe)
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1235
1236
1237 class DailymotionIE(InfoExtractor):
1238         """Information Extractor for Dailymotion"""
1239
1240         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1241
1242         def __init__(self, downloader=None):
1243                 InfoExtractor.__init__(self, downloader)
1244
1245         @staticmethod
1246         def suitable(url):
1247                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1248
1249         def report_download_webpage(self, video_id):
1250                 """Report webpage download."""
1251                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1252
1253         def report_extraction(self, video_id):
1254                 """Report information extraction."""
1255                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1256
1257         def _real_initialize(self):
1258                 return
1259
1260         def _real_extract(self, url):
1261                 # Extract id and simplified title from URL
1262                 mobj = re.match(self._VALID_URL, url)
1263                 if mobj is None:
1264                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1265                         return
1266
1267                 # At this point we have a new video
1268                 self._downloader.increment_downloads()
1269                 video_id = mobj.group(1)
1270
1271                 simple_title = mobj.group(2).decode('utf-8')
1272                 video_extension = 'flv'
1273
1274                 # Retrieve video webpage to extract further information
1275                 request = urllib2.Request(url)
1276                 try:
1277                         self.report_download_webpage(video_id)
1278                         webpage = urllib2.urlopen(request).read()
1279                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1280                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1281                         return
1282
1283                 # Extract URL, uploader and title from webpage
1284                 self.report_extraction(video_id)
1285                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1286                 if mobj is None:
1287                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1288                         return
1289                 mediaURL = urllib.unquote(mobj.group(1))
1290
1291                 # if needed add http://www.dailymotion.com/ if relative URL
1292
1293                 video_url = mediaURL
1294
1295                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1296                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1297                 if mobj is None:
1298                         self._downloader.trouble(u'ERROR: unable to extract title')
1299                         return
1300                 video_title = mobj.group(1).decode('utf-8')
1301                 video_title = sanitize_title(video_title)
1302
1303                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1304                 if mobj is None:
1305                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1306                         return
1307                 video_uploader = mobj.group(1)
1308
1309                 try:
1310                         # Process video information
1311                         self._downloader.process_info({
1312                                 'id':           video_id.decode('utf-8'),
1313                                 'url':          video_url.decode('utf-8'),
1314                                 'uploader':     video_uploader.decode('utf-8'),
1315                                 'upload_date':  u'NA',
1316                                 'title':        video_title,
1317                                 'stitle':       simple_title,
1318                                 'ext':          video_extension.decode('utf-8'),
1319                                 'format':       u'NA',
1320                                 'player_url':   None,
1321                         })
1322                 except UnavailableVideoError:
1323                         self._downloader.trouble(u'\nERROR: unable to download video')
1324
1325 class GoogleIE(InfoExtractor):
1326         """Information extractor for video.google.com."""
1327
1328         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1329
1330         def __init__(self, downloader=None):
1331                 InfoExtractor.__init__(self, downloader)
1332
1333         @staticmethod
1334         def suitable(url):
1335                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1336
1337         def report_download_webpage(self, video_id):
1338                 """Report webpage download."""
1339                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1340
1341         def report_extraction(self, video_id):
1342                 """Report information extraction."""
1343                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1344
1345         def _real_initialize(self):
1346                 return
1347
1348         def _real_extract(self, url):
1349                 # Extract id from URL
1350                 mobj = re.match(self._VALID_URL, url)
1351                 if mobj is None:
1352                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1353                         return
1354
1355                 # At this point we have a new video
1356                 self._downloader.increment_downloads()
1357                 video_id = mobj.group(1)
1358
1359                 video_extension = 'mp4'
1360
1361                 # Retrieve video webpage to extract further information
1362                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1363                 try:
1364                         self.report_download_webpage(video_id)
1365                         webpage = urllib2.urlopen(request).read()
1366                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1367                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1368                         return
1369
1370                 # Extract URL, uploader, and title from webpage
1371                 self.report_extraction(video_id)
1372                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1373                 if mobj is None:
1374                         video_extension = 'flv'
1375                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1376                 if mobj is None:
1377                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1378                         return
1379                 mediaURL = urllib.unquote(mobj.group(1))
1380                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1381                 mediaURL = mediaURL.replace('\\x26', '\x26')
1382
1383                 video_url = mediaURL
1384
1385                 mobj = re.search(r'<title>(.*)</title>', webpage)
1386                 if mobj is None:
1387                         self._downloader.trouble(u'ERROR: unable to extract title')
1388                         return
1389                 video_title = mobj.group(1).decode('utf-8')
1390                 video_title = sanitize_title(video_title)
1391                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1392
1393                 # Extract video description
1394                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1395                 if mobj is None:
1396                         self._downloader.trouble(u'ERROR: unable to extract video description')
1397                         return
1398                 video_description = mobj.group(1).decode('utf-8')
1399                 if not video_description:
1400                         video_description = 'No description available.'
1401
1402                 # Extract video thumbnail
1403                 if self._downloader.params.get('forcethumbnail', False):
1404                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1405                         try:
1406                                 webpage = urllib2.urlopen(request).read()
1407                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1408                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1409                                 return
1410                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1411                         if mobj is None:
1412                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1413                                 return
1414                         video_thumbnail = mobj.group(1)
1415                 else:   # we need something to pass to process_info
1416                         video_thumbnail = ''
1417
1418
1419                 try:
1420                         # Process video information
1421                         self._downloader.process_info({
1422                                 'id':           video_id.decode('utf-8'),
1423                                 'url':          video_url.decode('utf-8'),
1424                                 'uploader':     u'NA',
1425                                 'upload_date':  u'NA',
1426                                 'title':        video_title,
1427                                 'stitle':       simple_title,
1428                                 'ext':          video_extension.decode('utf-8'),
1429                                 'format':       u'NA',
1430                                 'player_url':   None,
1431                         })
1432                 except UnavailableVideoError:
1433                         self._downloader.trouble(u'\nERROR: unable to download video')
1434
1435
1436 class PhotobucketIE(InfoExtractor):
1437         """Information extractor for photobucket.com."""
1438
1439         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1440
1441         def __init__(self, downloader=None):
1442                 InfoExtractor.__init__(self, downloader)
1443
1444         @staticmethod
1445         def suitable(url):
1446                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1447
1448         def report_download_webpage(self, video_id):
1449                 """Report webpage download."""
1450                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1451
1452         def report_extraction(self, video_id):
1453                 """Report information extraction."""
1454                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1455
1456         def _real_initialize(self):
1457                 return
1458
1459         def _real_extract(self, url):
1460                 # Extract id from URL
1461                 mobj = re.match(self._VALID_URL, url)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1464                         return
1465
1466                 # At this point we have a new video
1467                 self._downloader.increment_downloads()
1468                 video_id = mobj.group(1)
1469
1470                 video_extension = 'flv'
1471
1472                 # Retrieve video webpage to extract further information
1473                 request = urllib2.Request(url)
1474                 try:
1475                         self.report_download_webpage(video_id)
1476                         webpage = urllib2.urlopen(request).read()
1477                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1479                         return
1480
1481                 # Extract URL, uploader, and title from webpage
1482                 self.report_extraction(video_id)
1483                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1484                 if mobj is None:
1485                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1486                         return
1487                 mediaURL = urllib.unquote(mobj.group(1))
1488
1489                 video_url = mediaURL
1490
1491                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1492                 if mobj is None:
1493                         self._downloader.trouble(u'ERROR: unable to extract title')
1494                         return
1495                 video_title = mobj.group(1).decode('utf-8')
1496                 video_title = sanitize_title(video_title)
1497                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1498
1499                 video_uploader = mobj.group(2).decode('utf-8')
1500
1501                 try:
1502                         # Process video information
1503                         self._downloader.process_info({
1504                                 'id':           video_id.decode('utf-8'),
1505                                 'url':          video_url.decode('utf-8'),
1506                                 'uploader':     video_uploader,
1507                                 'upload_date':  u'NA',
1508                                 'title':        video_title,
1509                                 'stitle':       simple_title,
1510                                 'ext':          video_extension.decode('utf-8'),
1511                                 'format':       u'NA',
1512                                 'player_url':   None,
1513                         })
1514                 except UnavailableVideoError:
1515                         self._downloader.trouble(u'\nERROR: unable to download video')
1516
1517
1518 class YahooIE(InfoExtractor):
1519         """Information extractor for video.yahoo.com."""
1520
1521         # _VALID_URL matches all Yahoo! Video URLs
1522         # _VPAGE_URL matches only the extractable '/watch/' URLs
1523         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1524         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1525
1526         def __init__(self, downloader=None):
1527                 InfoExtractor.__init__(self, downloader)
1528
1529         @staticmethod
1530         def suitable(url):
1531                 return (re.match(YahooIE._VALID_URL, url) is not None)
1532
1533         def report_download_webpage(self, video_id):
1534                 """Report webpage download."""
1535                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1536
1537         def report_extraction(self, video_id):
1538                 """Report information extraction."""
1539                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1540
1541         def _real_initialize(self):
1542                 return
1543
1544         def _real_extract(self, url, new_video=True):
1545                 # Extract ID from URL
1546                 mobj = re.match(self._VALID_URL, url)
1547                 if mobj is None:
1548                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1549                         return
1550
1551                 # At this point we have a new video
1552                 self._downloader.increment_downloads()
1553                 video_id = mobj.group(2)
1554                 video_extension = 'flv'
1555
1556                 # Rewrite valid but non-extractable URLs as
1557                 # extractable English language /watch/ URLs
1558                 if re.match(self._VPAGE_URL, url) is None:
1559                         request = urllib2.Request(url)
1560                         try:
1561                                 webpage = urllib2.urlopen(request).read()
1562                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1563                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1564                                 return
1565
1566                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1567                         if mobj is None:
1568                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1569                                 return
1570                         yahoo_id = mobj.group(1)
1571
1572                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1573                         if mobj is None:
1574                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1575                                 return
1576                         yahoo_vid = mobj.group(1)
1577
1578                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1579                         return self._real_extract(url, new_video=False)
1580
1581                 # Retrieve video webpage to extract further information
1582                 request = urllib2.Request(url)
1583                 try:
1584                         self.report_download_webpage(video_id)
1585                         webpage = urllib2.urlopen(request).read()
1586                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1587                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1588                         return
1589
1590                 # Extract uploader and title from webpage
1591                 self.report_extraction(video_id)
1592                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1593                 if mobj is None:
1594                         self._downloader.trouble(u'ERROR: unable to extract video title')
1595                         return
1596                 video_title = mobj.group(1).decode('utf-8')
1597                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1598
1599                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1600                 if mobj is None:
1601                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1602                         return
1603                 video_uploader = mobj.group(1).decode('utf-8')
1604
1605                 # Extract video thumbnail
1606                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1607                 if mobj is None:
1608                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1609                         return
1610                 video_thumbnail = mobj.group(1).decode('utf-8')
1611
1612                 # Extract video description
1613                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1614                 if mobj is None:
1615                         self._downloader.trouble(u'ERROR: unable to extract video description')
1616                         return
1617                 video_description = mobj.group(1).decode('utf-8')
1618                 if not video_description: video_description = 'No description available.'
1619
1620                 # Extract video height and width
1621                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1622                 if mobj is None:
1623                         self._downloader.trouble(u'ERROR: unable to extract video height')
1624                         return
1625                 yv_video_height = mobj.group(1)
1626
1627                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1628                 if mobj is None:
1629                         self._downloader.trouble(u'ERROR: unable to extract video width')
1630                         return
1631                 yv_video_width = mobj.group(1)
1632
1633                 # Retrieve video playlist to extract media URL
1634                 # I'm not completely sure what all these options are, but we
1635                 # seem to need most of them, otherwise the server sends a 401.
1636                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1637                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1638                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1639                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1640                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1641                 try:
1642                         self.report_download_webpage(video_id)
1643                         webpage = urllib2.urlopen(request).read()
1644                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1645                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1646                         return
1647
1648                 # Extract media URL from playlist XML
1649                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1650                 if mobj is None:
1651                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1652                         return
1653                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1654                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1655
1656                 try:
1657                         # Process video information
1658                         self._downloader.process_info({
1659                                 'id':           video_id.decode('utf-8'),
1660                                 'url':          video_url,
1661                                 'uploader':     video_uploader,
1662                                 'upload_date':  u'NA',
1663                                 'title':        video_title,
1664                                 'stitle':       simple_title,
1665                                 'ext':          video_extension.decode('utf-8'),
1666                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1667                                 'description':  video_description,
1668                                 'thumbnail':    video_thumbnail,
1669                                 'description':  video_description,
1670                                 'player_url':   None,
1671                         })
1672                 except UnavailableVideoError:
1673                         self._downloader.trouble(u'\nERROR: unable to download video')
1674
1675
1676 class GenericIE(InfoExtractor):
1677         """Generic last-resort information extractor."""
1678
1679         def __init__(self, downloader=None):
1680                 InfoExtractor.__init__(self, downloader)
1681
1682         @staticmethod
1683         def suitable(url):
1684                 return True
1685
1686         def report_download_webpage(self, video_id):
1687                 """Report webpage download."""
1688                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1689                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1690
1691         def report_extraction(self, video_id):
1692                 """Report information extraction."""
1693                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1694
1695         def _real_initialize(self):
1696                 return
1697
1698         def _real_extract(self, url):
1699                 # At this point we have a new video
1700                 self._downloader.increment_downloads()
1701
1702                 video_id = url.split('/')[-1]
1703                 request = urllib2.Request(url)
1704                 try:
1705                         self.report_download_webpage(video_id)
1706                         webpage = urllib2.urlopen(request).read()
1707                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1708                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1709                         return
1710                 except ValueError, err:
1711                         # since this is the last-resort InfoExtractor, if
1712                         # this error is thrown, it'll be thrown here
1713                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1714                         return
1715
1716                 self.report_extraction(video_id)
1717                 # Start with something easy: JW Player in SWFObject
1718                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1719                 if mobj is None:
1720                         # Broaden the search a little bit
1721                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1722                 if mobj is None:
1723                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1724                         return
1725
1726                 # It's possible that one of the regexes
1727                 # matched, but returned an empty group:
1728                 if mobj.group(1) is None:
1729                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1730                         return
1731
1732                 video_url = urllib.unquote(mobj.group(1))
1733                 video_id  = os.path.basename(video_url)
1734
1735                 # here's a fun little line of code for you:
1736                 video_extension = os.path.splitext(video_id)[1][1:]
1737                 video_id        = os.path.splitext(video_id)[0]
1738
1739                 # it's tempting to parse this further, but you would
1740                 # have to take into account all the variations like
1741                 #   Video Title - Site Name
1742                 #   Site Name | Video Title
1743                 #   Video Title - Tagline | Site Name
1744                 # and so on and so forth; it's just not practical
1745                 mobj = re.search(r'<title>(.*)</title>', webpage)
1746                 if mobj is None:
1747                         self._downloader.trouble(u'ERROR: unable to extract title')
1748                         return
1749                 video_title = mobj.group(1).decode('utf-8')
1750                 video_title = sanitize_title(video_title)
1751                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1752
1753                 # video uploader is domain name
1754                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1755                 if mobj is None:
1756                         self._downloader.trouble(u'ERROR: unable to extract title')
1757                         return
1758                 video_uploader = mobj.group(1).decode('utf-8')
1759
1760                 try:
1761                         # Process video information
1762                         self._downloader.process_info({
1763                                 'id':           video_id.decode('utf-8'),
1764                                 'url':          video_url.decode('utf-8'),
1765                                 'uploader':     video_uploader,
1766                                 'upload_date':  u'NA',
1767                                 'title':        video_title,
1768                                 'stitle':       simple_title,
1769                                 'ext':          video_extension.decode('utf-8'),
1770                                 'format':       u'NA',
1771                                 'player_url':   None,
1772                         })
1773                 except UnavailableVideoError, err:
1774                         self._downloader.trouble(u'\nERROR: unable to download video')
1775
1776
1777 class YoutubeSearchIE(InfoExtractor):
1778         """Information Extractor for YouTube search queries."""
1779         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1780         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1781         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1782         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1783         _youtube_ie = None
1784         _max_youtube_results = 1000
1785
1786         def __init__(self, youtube_ie, downloader=None):
1787                 InfoExtractor.__init__(self, downloader)
1788                 self._youtube_ie = youtube_ie
1789
1790         @staticmethod
1791         def suitable(url):
1792                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1793
1794         def report_download_page(self, query, pagenum):
1795                 """Report attempt to download playlist page with given number."""
1796                 query = query.decode(preferredencoding())
1797                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1798
1799         def _real_initialize(self):
1800                 self._youtube_ie.initialize()
1801
1802         def _real_extract(self, query):
1803                 mobj = re.match(self._VALID_QUERY, query)
1804                 if mobj is None:
1805                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1806                         return
1807
1808                 prefix, query = query.split(':')
1809                 prefix = prefix[8:]
1810                 query  = query.encode('utf-8')
1811                 if prefix == '':
1812                         self._download_n_results(query, 1)
1813                         return
1814                 elif prefix == 'all':
1815                         self._download_n_results(query, self._max_youtube_results)
1816                         return
1817                 else:
1818                         try:
1819                                 n = long(prefix)
1820                                 if n <= 0:
1821                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1822                                         return
1823                                 elif n > self._max_youtube_results:
1824                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1825                                         n = self._max_youtube_results
1826                                 self._download_n_results(query, n)
1827                                 return
1828                         except ValueError: # parsing prefix as integer fails
1829                                 self._download_n_results(query, 1)
1830                                 return
1831
	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		already_seen = set()  # guards against duplicate ids across pages
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# The match looks like href="/watch?v=ID": split on '=' and
				# drop the trailing quote to isolate the video id.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
						return

			# No "Next" link: fewer than n results exist; extract what we have.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				return

			pagenum = pagenum + 1
1867
1868 class GoogleSearchIE(InfoExtractor):
1869         """Information Extractor for Google Video search queries."""
1870         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1871         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1872         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1873         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1874         _google_ie = None
1875         _max_google_results = 1000
1876
1877         def __init__(self, google_ie, downloader=None):
1878                 InfoExtractor.__init__(self, downloader)
1879                 self._google_ie = google_ie
1880
1881         @staticmethod
1882         def suitable(url):
1883                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1884
1885         def report_download_page(self, query, pagenum):
1886                 """Report attempt to download playlist page with given number."""
1887                 query = query.decode(preferredencoding())
1888                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1889
1890         def _real_initialize(self):
1891                 self._google_ie.initialize()
1892
1893         def _real_extract(self, query):
1894                 mobj = re.match(self._VALID_QUERY, query)
1895                 if mobj is None:
1896                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1897                         return
1898
1899                 prefix, query = query.split(':')
1900                 prefix = prefix[8:]
1901                 query  = query.encode('utf-8')
1902                 if prefix == '':
1903                         self._download_n_results(query, 1)
1904                         return
1905                 elif prefix == 'all':
1906                         self._download_n_results(query, self._max_google_results)
1907                         return
1908                 else:
1909                         try:
1910                                 n = long(prefix)
1911                                 if n <= 0:
1912                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1913                                         return
1914                                 elif n > self._max_google_results:
1915                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1916                                         n = self._max_google_results
1917                                 self._download_n_results(query, n)
1918                                 return
1919                         except ValueError: # parsing prefix as integer fails
1920                                 self._download_n_results(query, 1)
1921                                 return
1922
1923         def _download_n_results(self, query, n):
1924                 """Downloads a specified number of results for a query"""
1925
1926                 video_ids = []
1927                 already_seen = set()
1928                 pagenum = 1
1929
1930                 while True:
1931                         self.report_download_page(query, pagenum)
1932                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1933                         request = urllib2.Request(result_url)
1934                         try:
1935                                 page = urllib2.urlopen(request).read()
1936                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1937                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1938                                 return
1939
1940                         # Extract video identifiers
1941                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1942                                 video_id = mobj.group(1)
1943                                 if video_id not in already_seen:
1944                                         video_ids.append(video_id)
1945                                         already_seen.add(video_id)
1946                                         if len(video_ids) == n:
1947                                                 # Specified n videos reached
1948                                                 for id in video_ids:
1949                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1950                                                 return
1951
1952                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1953                                 for id in video_ids:
1954                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1955                                 return
1956
1957                         pagenum = pagenum + 1
1958
1959 class YahooSearchIE(InfoExtractor):
1960         """Information Extractor for Yahoo! Video search queries."""
1961         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1962         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1963         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1964         _MORE_PAGES_INDICATOR = r'\s*Next'
1965         _yahoo_ie = None
1966         _max_yahoo_results = 1000
1967
1968         def __init__(self, yahoo_ie, downloader=None):
1969                 InfoExtractor.__init__(self, downloader)
1970                 self._yahoo_ie = yahoo_ie
1971
1972         @staticmethod
1973         def suitable(url):
1974                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1975
1976         def report_download_page(self, query, pagenum):
1977                 """Report attempt to download playlist page with given number."""
1978                 query = query.decode(preferredencoding())
1979                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1980
1981         def _real_initialize(self):
1982                 self._yahoo_ie.initialize()
1983
1984         def _real_extract(self, query):
1985                 mobj = re.match(self._VALID_QUERY, query)
1986                 if mobj is None:
1987                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1988                         return
1989
1990                 prefix, query = query.split(':')
1991                 prefix = prefix[8:]
1992                 query  = query.encode('utf-8')
1993                 if prefix == '':
1994                         self._download_n_results(query, 1)
1995                         return
1996                 elif prefix == 'all':
1997                         self._download_n_results(query, self._max_yahoo_results)
1998                         return
1999                 else:
2000                         try:
2001                                 n = long(prefix)
2002                                 if n <= 0:
2003                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2004                                         return
2005                                 elif n > self._max_yahoo_results:
2006                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
2007                                         n = self._max_yahoo_results
2008                                 self._download_n_results(query, n)
2009                                 return
2010                         except ValueError: # parsing prefix as integer fails
2011                                 self._download_n_results(query, 1)
2012                                 return
2013
2014         def _download_n_results(self, query, n):
2015                 """Downloads a specified number of results for a query"""
2016
2017                 video_ids = []
2018                 already_seen = set()
2019                 pagenum = 1
2020
2021                 while True:
2022                         self.report_download_page(query, pagenum)
2023                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2024                         request = urllib2.Request(result_url)
2025                         try:
2026                                 page = urllib2.urlopen(request).read()
2027                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2028                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2029                                 return
2030
2031                         # Extract video identifiers
2032                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2033                                 video_id = mobj.group(1)
2034                                 if video_id not in already_seen:
2035                                         video_ids.append(video_id)
2036                                         already_seen.add(video_id)
2037                                         if len(video_ids) == n:
2038                                                 # Specified n videos reached
2039                                                 for id in video_ids:
2040                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2041                                                 return
2042
2043                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2044                                 for id in video_ids:
2045                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2046                                 return
2047
2048                         pagenum = pagenum + 1
2049
2050 class YoutubePlaylistIE(InfoExtractor):
2051         """Information Extractor for YouTube playlists."""
2052
2053         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2054         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2055         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2056         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2057         _youtube_ie = None
2058
2059         def __init__(self, youtube_ie, downloader=None):
2060                 InfoExtractor.__init__(self, downloader)
2061                 self._youtube_ie = youtube_ie
2062
2063         @staticmethod
2064         def suitable(url):
2065                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2066
2067         def report_download_page(self, playlist_id, pagenum):
2068                 """Report attempt to download playlist page with given number."""
2069                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2070
2071         def _real_initialize(self):
2072                 self._youtube_ie.initialize()
2073
2074         def _real_extract(self, url):
2075                 # Extract playlist id
2076                 mobj = re.match(self._VALID_URL, url)
2077                 if mobj is None:
2078                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2079                         return
2080
2081                 # Download playlist pages
2082                 playlist_id = mobj.group(1)
2083                 video_ids = []
2084                 pagenum = 1
2085
2086                 while True:
2087                         self.report_download_page(playlist_id, pagenum)
2088                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2089                         try:
2090                                 page = urllib2.urlopen(request).read()
2091                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2092                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2093                                 return
2094
2095                         # Extract video identifiers
2096                         ids_in_page = []
2097                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2098                                 if mobj.group(1) not in ids_in_page:
2099                                         ids_in_page.append(mobj.group(1))
2100                         video_ids.extend(ids_in_page)
2101
2102                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2103                                 break
2104                         pagenum = pagenum + 1
2105
2106                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2107                 playlistend = self._downloader.params.get('playlistend', -1)
2108                 video_ids = video_ids[playliststart:playlistend]
2109
2110                 for id in video_ids:
2111                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2112                 return
2113
2114 class YoutubeUserIE(InfoExtractor):
2115         """Information Extractor for YouTube users."""
2116
2117         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2118         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2119         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2120         _youtube_ie = None
2121
2122         def __init__(self, youtube_ie, downloader=None):
2123                 InfoExtractor.__init__(self, downloader)
2124                 self._youtube_ie = youtube_ie
2125
2126         @staticmethod
2127         def suitable(url):
2128                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2129
2130         def report_download_page(self, username):
2131                 """Report attempt to download user page."""
2132                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2133
2134         def _real_initialize(self):
2135                 self._youtube_ie.initialize()
2136
2137         def _real_extract(self, url):
2138                 # Extract username
2139                 mobj = re.match(self._VALID_URL, url)
2140                 if mobj is None:
2141                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2142                         return
2143
2144                 # Download user page
2145                 username = mobj.group(1)
2146                 video_ids = []
2147                 pagenum = 1
2148
2149                 self.report_download_page(username)
2150                 request = urllib2.Request(self._TEMPLATE_URL % (username))
2151                 try:
2152                         page = urllib2.urlopen(request).read()
2153                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2155                         return
2156
2157                 # Extract video identifiers
2158                 ids_in_page = []
2159
2160                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2161                         if mobj.group(1) not in ids_in_page:
2162                                 ids_in_page.append(mobj.group(1))
2163                 video_ids.extend(ids_in_page)
2164
2165                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2166                 playlistend = self._downloader.params.get('playlistend', -1)
2167                 video_ids = video_ids[playliststart:playlistend]
2168
2169                 for id in video_ids:
2170                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2171                 return
2172
2173 class DepositFilesIE(InfoExtractor):
2174         """Information extractor for depositfiles.com"""
2175
2176         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2177
2178         def __init__(self, downloader=None):
2179                 InfoExtractor.__init__(self, downloader)
2180
2181         @staticmethod
2182         def suitable(url):
2183                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2184
2185         def report_download_webpage(self, file_id):
2186                 """Report webpage download."""
2187                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2188
2189         def report_extraction(self, file_id):
2190                 """Report information extraction."""
2191                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2192
2193         def _real_initialize(self):
2194                 return
2195
2196         def _real_extract(self, url):
2197                 # At this point we have a new file
2198                 self._downloader.increment_downloads()
2199
2200                 file_id = url.split('/')[-1]
2201                 # Rebuild url in english locale
2202                 url = 'http://depositfiles.com/en/files/' + file_id
2203
2204                 # Retrieve file webpage with 'Free download' button pressed
2205                 free_download_indication = { 'gateway_result' : '1' }
2206                 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2207                 try:
2208                         self.report_download_webpage(file_id)
2209                         webpage = urllib2.urlopen(request).read()
2210                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2211                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2212                         return
2213
2214                 # Search for the real file URL
2215                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2216                 if (mobj is None) or (mobj.group(1) is None):
2217                         # Try to figure out reason of the error.
2218                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2219                         if (mobj is not None) and (mobj.group(1) is not None):
2220                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2221                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2222                         else:
2223                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2224                         return
2225
2226                 file_url = mobj.group(1)
2227                 file_extension = os.path.splitext(file_url)[1][1:]
2228
2229                 # Search for file title
2230                 mobj = re.search(r'<b title="(.*?)">', webpage)
2231                 if mobj is None:
2232                         self._downloader.trouble(u'ERROR: unable to extract title')
2233                         return
2234                 file_title = mobj.group(1).decode('utf-8')
2235
2236                 try:
2237                         # Process file information
2238                         self._downloader.process_info({
2239                                 'id':           file_id.decode('utf-8'),
2240                                 'url':          file_url.decode('utf-8'),
2241                                 'uploader':     u'NA',
2242                                 'upload_date':  u'NA',
2243                                 'title':        file_title,
2244                                 'stitle':       file_title,
2245                                 'ext':          file_extension.decode('utf-8'),
2246                                 'format':       u'NA',
2247                                 'player_url':   None,
2248                         })
2249                 except UnavailableVideoError, err:
2250                         self._downloader.trouble(u'ERROR: unable to download file')
2251
class PostProcessor(object):
	"""Base class for post processors.

	Instances are registered on a downloader via its
	add_post_processor() method. After each successful download the
	downloader walks its registered PostProcessors in order, calling
	run() first with the download's information dictionary and then
	with whatever the previous PostProcessor returned. Processing
	stops as soon as a run() returns None or the last processor has
	run.

	Like InfoExtractor objects, PostProcessors take part in a
	"mutual registration" with their downloader.
	"""

	# Downloader this PP is registered with (set via __init__ or
	# set_downloader).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is an InfoExtractor-style dictionary extended
		with a "filepath" key naming the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the same one, with fields
		adjusted) passes it on to the next PostProcessor.

		Implementations may also raise PostProcessingError, which the
		calling downloader handles.
		"""
		return information # default implementation: pass through untouched
2297
2298 ### MAIN PROGRAM ###
2299 if __name__ == '__main__':
2300         try:
2301                 # Modules needed only when running the main program
2302                 import getpass
2303                 import optparse
2304
2305                 # Function to update the program file with the latest version from the repository.
2306                 def update_self(downloader, filename):
2307                         # Note: downloader only used for options
2308                         if not os.access(filename, os.W_OK):
2309                                 sys.exit('ERROR: no write permissions on %s' % filename)
2310
2311                         downloader.to_screen('Updating to latest stable version...')
2312                         try:
2313                                 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2314                                 latest_version = urllib.urlopen(latest_url).read().strip()
2315                                 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2316                                 newcontent = urllib.urlopen(prog_url).read()
2317                         except (IOError, OSError), err:
2318                                 sys.exit('ERROR: unable to download latest version')
2319                         try:
2320                                 stream = open(filename, 'w')
2321                                 stream.write(newcontent)
2322                                 stream.close()
2323                         except (IOError, OSError), err:
2324                                 sys.exit('ERROR: unable to overwrite current version')
2325                         downloader.to_screen('Updated to version %s' % latest_version)
2326
2327                 # Parse command line
2328                 parser = optparse.OptionParser(
2329                         usage='Usage: %prog [options] url...',
2330                         version='2010.12.09',
2331                         conflict_handler='resolve',
2332                 )
2333
2334                 parser.add_option('-h', '--help',
2335                                 action='help', help='print this help text and exit')
2336                 parser.add_option('-v', '--version',
2337                                 action='version', help='print program version and exit')
2338                 parser.add_option('-U', '--update',
2339                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2340                 parser.add_option('-i', '--ignore-errors',
2341                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2342                 parser.add_option('-r', '--rate-limit',
2343                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2344                 parser.add_option('-R', '--retries',
2345                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2346                 parser.add_option('--playlist-start',
2347                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2348                 parser.add_option('--playlist-end',
2349                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2350                 parser.add_option('--dump-user-agent',
2351                                 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2352
2353                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2354                 authentication.add_option('-u', '--username',
2355                                 dest='username', metavar='USERNAME', help='account username')
2356                 authentication.add_option('-p', '--password',
2357                                 dest='password', metavar='PASSWORD', help='account password')
2358                 authentication.add_option('-n', '--netrc',
2359                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2360                 parser.add_option_group(authentication)
2361
2362                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2363                 video_format.add_option('-f', '--format',
2364                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2365                 video_format.add_option('--all-formats',
2366                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2367                 video_format.add_option('--max-quality',
2368                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2369                 parser.add_option_group(video_format)
2370
2371                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2372                 verbosity.add_option('-q', '--quiet',
2373                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2374                 verbosity.add_option('-s', '--simulate',
2375                                 action='store_true', dest='simulate', help='do not download video', default=False)
2376                 verbosity.add_option('-g', '--get-url',
2377                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2378                 verbosity.add_option('-e', '--get-title',
2379                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2380                 verbosity.add_option('--get-thumbnail',
2381                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2382                 verbosity.add_option('--get-description',
2383                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2384                 verbosity.add_option('--no-progress',
2385                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2386                 verbosity.add_option('--console-title',
2387                                 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2388                 parser.add_option_group(verbosity)
2389
2390                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2391                 filesystem.add_option('-t', '--title',
2392                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2393                 filesystem.add_option('-l', '--literal',
2394                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2395                 filesystem.add_option('-A', '--auto-number',
2396                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2397                 filesystem.add_option('-o', '--output',
2398                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2399                 filesystem.add_option('-a', '--batch-file',
2400                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2401                 filesystem.add_option('-w', '--no-overwrites',
2402                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2403                 filesystem.add_option('-c', '--continue',
2404                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2405                 filesystem.add_option('--cookies',
2406                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2407                 filesystem.add_option('--no-part',
2408                                 action='store_true', dest='nopart', help='do not use .part files', default=False)
2409                 parser.add_option_group(filesystem)
2410
		(opts, args) = parser.parse_args()

		# Open appropriate CookieJar: an in-memory jar by default, or a
		# Netscape-format MozillaCookieJar bound to --cookies FILE.
		if opts.cookiefile is None:
			jar = cookielib.CookieJar()
		else:
			try:
				jar = cookielib.MozillaCookieJar(opts.cookiefile)
				# Only load existing cookies when the file is present and
				# readable; a missing file is fine (it will be created on
				# save), an unreadable one is a fatal error below.
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to open cookie file')

		# Dump user agent: print the UA string sent with every request and
		# exit successfully without downloading anything.
		if opts.dump_user_agent:
			print std_headers['User-Agent']
			sys.exit(0)
2428
		# General configuration: install a process-wide urllib2 opener so
		# every request gets proxy support, the cookie jar, and the
		# gzip/deflate handling provided by YoutubeDLHandler.
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Batch file verification: read one URL per line from --batch-file
		# (or stdin for '-'), dropping blanks and comment lines.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Lines starting with '#', '/' or ';' are treated as comments.
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		# Batch URLs are processed before any URLs given on the command line.
		all_urls = batchurls + args
2448
		# Conflicting, missing and erroneous options.  parser.error() prints
		# the message and exits with status 2.
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# NOTE(review): getpass is not in the top-of-file import list;
			# presumably it is imported in the main-program import section
			# before this window (as optparse must be) — verify.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# parse_bytes understands suffixed values like '50k' or '4.2m'
			# and returns None on anything it cannot parse.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		# Playlist bounds: start must be a positive integer; end must be -1
		# (meaning "until the end") or an integer >= start.
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')
2482
		# Information extractors.  YoutubeIE is shared: the playlist, user
		# and search extractors take it as a constructor argument, and the
		# same pattern holds for the Google and Yahoo search extractors.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		generic_ie = GenericIE()
2497
		# File downloader: translate the parsed options into the params
		# dict FileDownloader expects.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any --get-* option implies quiet, simulated operation.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template selection via the pre-2.5 "cond and a or b"
			# chaining idiom: the first truthy clause wins.  An explicit -o
			# takes precedence; otherwise the template is derived from
			# --all-formats (-1), -t/-l and -A, falling back to the bare
			# '%(id)s.%(ext)s'.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# '-o -' writes the video to stdout, so logging must go to stderr.
			'logtostderr': opts.outtmpl == '-',
			'consoletitle': opts.consoletitle,
			'nopart': opts.nopart,
			})
		# Registration order matters: extractors are tried in this order
		# until one claims the URL.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)
2549
		# Update version: --update-self replaces the script at sys.argv[0]
		# with the latest release before any downloading happens.
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: no URLs is an error, unless the run was purely
		# a self-update, which exits successfully here.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		# Propagate the downloader's aggregate return code to the shell.
		sys.exit(retcode)
2570
	except DownloadError:
		# The downloader already reported the failure; just exit non-zero.
		sys.exit(1)
	except SameFileError:
		# Two URLs resolved to the same output filename under a fixed -o.
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Ctrl-C: leave a clean message instead of a traceback.
		sys.exit(u'\nERROR: Interrupted by user')