Merge branch 'master' of https://github.com/psi-neamf/youtube-dl into psi-neamf
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import cookielib
8 import datetime
9 import htmlentitydefs
10 import httplib
11 import locale
12 import math
13 import netrc
14 import os
15 import os.path
16 import re
17 import socket
18 import string
19 import subprocess
20 import sys
21 import time
22 import urllib
23 import urllib2
24
25 # parse_qs was moved from the cgi module to the urlparse module recently.
26 try:
27         from urlparse import parse_qs
28 except ImportError:
29         from cgi import parse_qs
30
# HTTP headers sent with every request. The User-Agent mimics a real desktop
# Firefox browser so sites serve the same pages they would serve a browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.11) Gecko/20101019 Firefox/3.6.11',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
}
37
# Characters allowed to appear unchanged in a "simplified" title (used when
# building filesystem-safe filenames), as a unicode string.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
39
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks: if the
        reported encoding cannot actually encode text, fall back to UTF-8.
        """
        try:
                pref = locale.getpreferredencoding()
                # Verify the reported codec really exists and is usable;
                # this raises (e.g. LookupError) for bogus locale settings.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
55
56 def htmlentity_transform(matchobj):
57         """Transforms an HTML entity to a Unicode character.
58         
59         This function receives a match object and is intended to be used with
60         the re.sub() function.
61         """
62         entity = matchobj.group(1)
63
64         # Known non-numeric HTML entity
65         if entity in htmlentitydefs.name2codepoint:
66                 return unichr(htmlentitydefs.name2codepoint[entity])
67
68         # Unicode character
69         mobj = re.match(ur'(?u)#(x?\d+)', entity)
70         if mobj is not None:
71                 numstr = mobj.group(1)
72                 if numstr.startswith(u'x'):
73                         base = 16
74                         numstr = u'0%s' % numstr
75                 else:
76                         base = 10
77                 return unichr(long(numstr, base))
78
79         # Unknown entity in name, return its literal representation
80         return (u'&%s;' % entity)
81
82 def sanitize_title(utitle):
83         """Sanitizes a video title so it could be used as part of a filename."""
84         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
85         return utitle.replace(unicode(os.sep), u'%')
86
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                if filename == u'-':
                        # "-" means standard output. On Windows stdout must be
                        # switched to binary mode or the video data would be
                        # corrupted by newline translation.
                        if sys.platform == 'win32':
                                import msvcrt
                                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
112
113
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects that are not configured to keep
        going on errors; the instance carries the corresponding error
        message.
        """
122
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that several
        downloads would end up writing to one and the same file on disk.
        """
130
class PostProcessingError(Exception):
        """Post Processing exception.

        A PostProcessor's .run() method may raise this to signal that the
        postprocessing task failed.
        """
138
class UnavailableVideoError(Exception):
        """Unavailable video format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass
146
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when the data they actually
        receive is smaller than what the server announced up front,
        which usually means the connection was interrupted.
        """
        # Byte counts: what was actually received vs. what was announced.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
161
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:         Username for authentication purposes.
        password:         Password for authentication purposes.
        usenetrc:         Use netrc for authentication instead.
        quiet:            Do not print messages to stdout.
        forceurl:         Force printing final URL.
        forcetitle:       Force printing title.
        forcethumbnail:   Force printing thumbnail URL.
        forcedescription: Force printing description.
        simulate:         Do not download the video files.
        format:           Video format code.
        format_limit:     Highest quality format to try.
        outtmpl:          Template for output names.
        ignoreerrors:     Do not stop on download errors.
        ratelimit:        Download speed limit, in bytes/sec.
        nooverwrites:     Prevent overwriting files.
        retries:          Number of times to retry for HTTP error 5xx
        continuedl:       Try to continue downloads if possible.
        noprogress:       Do not print the progress bar.
        playliststart:    Playlist item to start at.
        playlistend:      Playlist item to end at.
        logtostderr:      Log messages to stderr instead of stdout.
        """

        # These are reset per instance in __init__; the class-level values
        # mainly document the attribute names.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None
        _num_downloads = None
        _screen_file = None

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self._num_downloads = 0
                # Bool-as-index trick: logtostderr selects stderr over stdout.
                self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # Build the list of ancestor paths: "a", "a/b", "a/b/c", ...
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Return a human-readable string for a byte count, using 1024-based units."""
                if bytes is None:
                        return 'N/A'
                # Content-Length headers arrive as strings.
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        exponent = 0
                else:
                        exponent = long(math.log(bytes, 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the completed percentage as a fixed-width string."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS' (or '--:--' if unknown)."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed since start as a padded string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Choose the next read size based on the last block's throughput."""
                # Keep the new size within [half, double] of the last block.
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An absent suffix lowercases to '' whose index is 0 ('b'),
                # so the multiplier is 1024**0 == 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                terminator = [u'\n', u''][skip_eol]
                                # The trailing comma suppresses print's own newline;
                                # the terminator controls the end-of-line instead.
                                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                        self._screen_file.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message.encode(preferredencoding())

        def fixed_template(self):
                """Checks if the output template is fixed (contains no %(...)s fields)."""
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                self._download_retcode = 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep long enough that the average speed drops back
                        # to the configured limit.
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 5xx"""
                self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_screen(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # Fall back to a message without the (unencodable) name.
                        self.to_screen(u'[download] The file has already been downloaded')

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_screen(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_screen(u'[download] Download completed')
                else:
                        # Just terminate the progress line.
                        self.to_screen(u'')

        def increment_downloads(self):
                """Increment the ordinal that assigns a number to each file."""
                self._num_downloads += 1

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                # Render the output filename from the template, adding the
                # synthetic 'epoch' and 'autonumber' fields.
                try:
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists and will be skipped')
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble(u'ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                                return

        def download(self, url_list):
                """Download a given list of URLs."""
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain.
                        if info is None:
                                break

        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an RTMP stream by shelling out to the external rtmpdump tool."""
                self.report_destination(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(filename)
                        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(filename)
                        # Give up resuming if the file stopped growing.
                        if prevsize == cursize and retval == 1:
                                break
                if retval == 0:
                        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
                        return True
                else:
                        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
                        return False

        def _do_download(self, filename, url, player_url):
                """Download url to filename over HTTP(S) (or rtmpdump for RTMP),
                resuming and retrying as configured; returns True on success."""
                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                stream = None
                open_mode = 'wb'
                basic_request = urllib2.Request(url, None, std_headers)
                request = urllib2.Request(url, None, std_headers)

                # Establish possible resume length
                if os.path.isfile(filename):
                        resume_len = os.path.getsize(filename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if (err.code < 500 or err.code >= 600) and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code < 500 or err.code >= 600:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len

                        # Open file just in time
                        if stream is None:
                                try:
                                        (stream, filename) = sanitize_open(filename, open_mode)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                                return False
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                # data_len is still the header string here, so compare against
                # the stringified byte counter.
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                return True
627
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and pulls out everything the
	downloader needs to fetch the video(s) it points to: the real video
	URL, the literal and simplified titles, the uploader and so on. The
	result is a dictionary that gets handed to the FileDownloader, which
	may then download the video to disk among other outcomes. Each
	dictionary must contain the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize(), _real_extract() and
	the suitable() static method; they are then instantiated and
	registered with the main downloader.
	"""

	# Lazily flipped to True by initialize() after _real_initialize() ran.
	_ready = False
	# FileDownloader this IE reports to; may stay None.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
698
699 class YoutubeIE(InfoExtractor):
700         """Information extractor for youtube.com."""
701
702         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
703         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
704         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
705         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
706         _NETRC_MACHINE = 'youtube'
707         # Listed in order of quality
708         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
709         _video_extensions = {
710                 '13': '3gp',
711                 '17': 'mp4',
712                 '18': 'mp4',
713                 '22': 'mp4',
714                 '37': 'mp4',
715                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
716                 '43': 'webm',
717                 '45': 'webm',
718         }
719
720         @staticmethod
721         def suitable(url):
722                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
723
724         def report_lang(self):
725                 """Report attempt to set language."""
726                 self._downloader.to_screen(u'[youtube] Setting language')
727
728         def report_login(self):
729                 """Report attempt to log in."""
730                 self._downloader.to_screen(u'[youtube] Logging in')
731         
732         def report_age_confirmation(self):
733                 """Report attempt to confirm age."""
734                 self._downloader.to_screen(u'[youtube] Confirming age')
735         
736         def report_video_webpage_download(self, video_id):
737                 """Report attempt to download video webpage."""
738                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
739         
740         def report_video_info_webpage_download(self, video_id):
741                 """Report attempt to download video info webpage."""
742                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
743         
744         def report_information_extraction(self, video_id):
745                 """Report attempt to extract video information."""
746                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
747         
748         def report_unavailable_format(self, video_id, format):
749                 """Report extracted video URL."""
750                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
751         
752         def report_rtmp_download(self):
753                 """Indicate the download will use the RTMP protocol."""
754                 self._downloader.to_screen(u'[youtube] RTMP download detected')
755         
756         def _real_initialize(self):
757                 if self._downloader is None:
758                         return
759
760                 username = None
761                 password = None
762                 downloader_params = self._downloader.params
763
764                 # Attempt to use provided username and password or .netrc data
765                 if downloader_params.get('username', None) is not None:
766                         username = downloader_params['username']
767                         password = downloader_params['password']
768                 elif downloader_params.get('usenetrc', False):
769                         try:
770                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
771                                 if info is not None:
772                                         username = info[0]
773                                         password = info[2]
774                                 else:
775                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
776                         except (IOError, netrc.NetrcParseError), err:
777                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
778                                 return
779
780                 # Set language
781                 request = urllib2.Request(self._LANG_URL, None, std_headers)
782                 try:
783                         self.report_lang()
784                         urllib2.urlopen(request).read()
785                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
786                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
787                         return
788
789                 # No authentication to be performed
790                 if username is None:
791                         return
792
793                 # Log in
794                 login_form = {
795                                 'current_form': 'loginForm',
796                                 'next':         '/',
797                                 'action_login': 'Log In',
798                                 'username':     username,
799                                 'password':     password,
800                                 }
801                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
802                 try:
803                         self.report_login()
804                         login_results = urllib2.urlopen(request).read()
805                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
806                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
807                                 return
808                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
809                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
810                         return
811         
812                 # Confirm age
813                 age_form = {
814                                 'next_url':             '/',
815                                 'action_confirm':       'Confirm',
816                                 }
817                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
818                 try:
819                         self.report_age_confirmation()
820                         age_results = urllib2.urlopen(request).read()
821                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
822                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
823                         return
824
825         def _real_extract(self, url):
826                 # Extract video id from URL
827                 mobj = re.match(self._VALID_URL, url)
828                 if mobj is None:
829                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
830                         return
831                 video_id = mobj.group(2)
832
833                 # Get video webpage
834                 self.report_video_webpage_download(video_id)
835                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
836                 try:
837                         video_webpage = urllib2.urlopen(request).read()
838                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
839                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
840                         return
841
842                 # Attempt to extract SWF player URL
843                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
844                 if mobj is not None:
845                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
846                 else:
847                         player_url = None
848
849                 # Get video info
850                 self.report_video_info_webpage_download(video_id)
851                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
852                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
853                                            % (video_id, el_type))
854                         request = urllib2.Request(video_info_url, None, std_headers)
855                         try:
856                                 video_info_webpage = urllib2.urlopen(request).read()
857                                 video_info = parse_qs(video_info_webpage)
858                                 if 'token' in video_info:
859                                         break
860                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
862                                 return
863                 if 'token' not in video_info:
864                         if 'reason' in video_info:
865                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
866                         else:
867                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
868                         return
869
870                 # Start extracting information
871                 self.report_information_extraction(video_id)
872
873                 # uploader
874                 if 'author' not in video_info:
875                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
876                         return
877                 video_uploader = urllib.unquote_plus(video_info['author'][0])
878
879                 # title
880                 if 'title' not in video_info:
881                         self._downloader.trouble(u'ERROR: unable to extract video title')
882                         return
883                 video_title = urllib.unquote_plus(video_info['title'][0])
884                 video_title = video_title.decode('utf-8')
885                 video_title = sanitize_title(video_title)
886
887                 # simplified title
888                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
889                 simple_title = simple_title.strip(ur'_')
890
891                 # thumbnail image
892                 if 'thumbnail_url' not in video_info:
893                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
894                         video_thumbnail = ''
895                 else:   # don't panic if we can't find it
896                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
897
898                 # upload date
899                 upload_date = u'NA'
900                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
901                 if mobj is not None:
902                         upload_date = mobj.group(1).split()
903                         format_expressions = ['%d %B %Y', '%B %d, %Y']
904                         for expression in format_expressions:
905                                 try:
906                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
907                                 except:
908                                         pass
909
910                 # description
911                 video_description = 'No description available.'
912                 if self._downloader.params.get('forcedescription', False):
913                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
914                         if mobj is not None:
915                                 video_description = mobj.group(1)
916
917                 # token
918                 video_token = urllib.unquote_plus(video_info['token'][0])
919
920                 # Decide which formats to download
921                 requested_format = self._downloader.params.get('format', None)
922                 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
923
924                 if 'fmt_url_map' in video_info:
925                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
926                         format_limit = self._downloader.params.get('format_limit', None)
927                         if format_limit is not None and format_limit in self._available_formats:
928                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
929                         else:
930                                 format_list = self._available_formats
931                         existing_formats = [x for x in format_list if x in url_map]
932                         if len(existing_formats) == 0:
933                                 self._downloader.trouble(u'ERROR: no known formats available for video')
934                                 return
935                         if requested_format is None:
936                                 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
937                         elif requested_format == '-1':
938                                 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
939                         else:
940                                 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
941
942                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
943                         self.report_rtmp_download()
944                         video_url_list = [(None, video_info['conn'][0])]
945
946                 else:
947                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
948                         return
949
950                 for format_param, video_real_url in video_url_list:
951                         # At this point we have a new video
952                         self._downloader.increment_downloads()
953
954                         # Extension
955                         video_extension = self._video_extensions.get(format_param, 'flv')
956
957                         # Find the video URL in fmt_url_map or conn paramters
958                         try:
959                                 # Process video information
960                                 self._downloader.process_info({
961                                         'id':           video_id.decode('utf-8'),
962                                         'url':          video_real_url.decode('utf-8'),
963                                         'uploader':     video_uploader.decode('utf-8'),
964                                         'uploaddate':   upload_date,
965                                         'title':        video_title,
966                                         'stitle':       simple_title,
967                                         'ext':          video_extension.decode('utf-8'),
968                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
969                                         'thumbnail':    video_thumbnail.decode('utf-8'),
970                                         'description':  video_description.decode('utf-8'),
971                                         'player_url':   player_url,
972                                 })
973                         except UnavailableVideoError, err:
974                                 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
975
976
977 class MetacafeIE(InfoExtractor):
978         """Information Extractor for metacafe.com."""
979
980         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
981         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
982         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
983         _youtube_ie = None
984
985         def __init__(self, youtube_ie, downloader=None):
986                 InfoExtractor.__init__(self, downloader)
987                 self._youtube_ie = youtube_ie
988
989         @staticmethod
990         def suitable(url):
991                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
992
993         def report_disclaimer(self):
994                 """Report disclaimer retrieval."""
995                 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
996
997         def report_age_confirmation(self):
998                 """Report attempt to confirm age."""
999                 self._downloader.to_screen(u'[metacafe] Confirming age')
1000         
1001         def report_download_webpage(self, video_id):
1002                 """Report webpage download."""
1003                 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1004         
1005         def report_extraction(self, video_id):
1006                 """Report information extraction."""
1007                 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1008
1009         def _real_initialize(self):
1010                 # Retrieve disclaimer
1011                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1012                 try:
1013                         self.report_disclaimer()
1014                         disclaimer = urllib2.urlopen(request).read()
1015                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1016                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1017                         return
1018
1019                 # Confirm age
1020                 disclaimer_form = {
1021                         'filters': '0',
1022                         'submit': "Continue - I'm over 18",
1023                         }
1024                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1025                 try:
1026                         self.report_age_confirmation()
1027                         disclaimer = urllib2.urlopen(request).read()
1028                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1029                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1030                         return
1031         
1032         def _real_extract(self, url):
1033                 # Extract id and simplified title from URL
1034                 mobj = re.match(self._VALID_URL, url)
1035                 if mobj is None:
1036                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1037                         return
1038
1039                 video_id = mobj.group(1)
1040
1041                 # Check if video comes from YouTube
1042                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1043                 if mobj2 is not None:
1044                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1045                         return
1046
1047                 # At this point we have a new video
1048                 self._downloader.increment_downloads()
1049
1050                 simple_title = mobj.group(2).decode('utf-8')
1051
1052                 # Retrieve video webpage to extract further information
1053                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1054                 try:
1055                         self.report_download_webpage(video_id)
1056                         webpage = urllib2.urlopen(request).read()
1057                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1058                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1059                         return
1060
1061                 # Extract URL, uploader and title from webpage
1062                 self.report_extraction(video_id)
1063                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1064                 if mobj is not None:
1065                         mediaURL = urllib.unquote(mobj.group(1))
1066                         video_extension = mediaURL[-3:]
1067                         
1068                         # Extract gdaKey if available
1069                         mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1070                         if mobj is None:
1071                                 video_url = mediaURL
1072                         else:
1073                                 gdaKey = mobj.group(1)
1074                                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1075                 else:
1076                         mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1077                         if mobj is None:
1078                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1079                                 return
1080                         vardict = parse_qs(mobj.group(1))
1081                         if 'mediaData' not in vardict:
1082                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1083                                 return
1084                         mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1085                         if mobj is None:
1086                                 self._downloader.trouble(u'ERROR: unable to extract media URL')
1087                                 return
1088                         mediaURL = mobj.group(1).replace('\\/', '/')
1089                         video_extension = mediaURL[-3:]
1090                         video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1091
1092                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1093                 if mobj is None:
1094                         self._downloader.trouble(u'ERROR: unable to extract title')
1095                         return
1096                 video_title = mobj.group(1).decode('utf-8')
1097                 video_title = sanitize_title(video_title)
1098
1099                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1100                 if mobj is None:
1101                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1102                         return
1103                 video_uploader = mobj.group(1)
1104
1105                 try:
1106                         # Process video information
1107                         self._downloader.process_info({
1108                                 'id':           video_id.decode('utf-8'),
1109                                 'url':          video_url.decode('utf-8'),
1110                                 'uploader':     video_uploader.decode('utf-8'),
1111                                 'uploaddate':   u'NA',
1112                                 'title':        video_title,
1113                                 'stitle':       simple_title,
1114                                 'ext':          video_extension.decode('utf-8'),
1115                                 'format':       u'NA',
1116                                 'player_url':   None,
1117                         })
1118                 except UnavailableVideoError:
1119                         self._downloader.trouble(u'ERROR: unable to download video')
1120
1121
1122 class DailymotionIE(InfoExtractor):
1123         """Information Extractor for Dailymotion"""
1124
1125         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1126
1127         def __init__(self, downloader=None):
1128                 InfoExtractor.__init__(self, downloader)
1129
1130         @staticmethod
1131         def suitable(url):
1132                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1133
1134         def report_download_webpage(self, video_id):
1135                 """Report webpage download."""
1136                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1137         
1138         def report_extraction(self, video_id):
1139                 """Report information extraction."""
1140                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1141
1142         def _real_initialize(self):
1143                 return
1144
1145         def _real_extract(self, url):
1146                 # Extract id and simplified title from URL
1147                 mobj = re.match(self._VALID_URL, url)
1148                 if mobj is None:
1149                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1150                         return
1151
1152                 # At this point we have a new video
1153                 self._downloader.increment_downloads()
1154                 video_id = mobj.group(1)
1155
1156                 simple_title = mobj.group(2).decode('utf-8')
1157                 video_extension = 'flv'
1158
1159                 # Retrieve video webpage to extract further information
1160                 request = urllib2.Request(url)
1161                 try:
1162                         self.report_download_webpage(video_id)
1163                         webpage = urllib2.urlopen(request).read()
1164                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1165                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1166                         return
1167
1168                 # Extract URL, uploader and title from webpage
1169                 self.report_extraction(video_id)
1170                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1171                 if mobj is None:
1172                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1173                         return
1174                 mediaURL = urllib.unquote(mobj.group(1))
1175
1176                 # if needed add http://www.dailymotion.com/ if relative URL
1177
1178                 video_url = mediaURL
1179
1180                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1181                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1182                 if mobj is None:
1183                         self._downloader.trouble(u'ERROR: unable to extract title')
1184                         return
1185                 video_title = mobj.group(1).decode('utf-8')
1186                 video_title = sanitize_title(video_title)
1187
1188                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1189                 if mobj is None:
1190                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1191                         return
1192                 video_uploader = mobj.group(1)
1193
1194                 try:
1195                         # Process video information
1196                         self._downloader.process_info({
1197                                 'id':           video_id.decode('utf-8'),
1198                                 'url':          video_url.decode('utf-8'),
1199                                 'uploader':     video_uploader.decode('utf-8'),
1200                                 'uploaddate':   u'NA',
1201                                 'title':        video_title,
1202                                 'stitle':       simple_title,
1203                                 'ext':          video_extension.decode('utf-8'),
1204                                 'format':       u'NA',
1205                                 'player_url':   None,
1206                         })
1207                 except UnavailableVideoError:
1208                         self._downloader.trouble(u'ERROR: unable to download video')
1209
1210 class GoogleIE(InfoExtractor):
1211         """Information extractor for video.google.com."""
1212
1213         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1214
1215         def __init__(self, downloader=None):
1216                 InfoExtractor.__init__(self, downloader)
1217
1218         @staticmethod
1219         def suitable(url):
1220                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1221
1222         def report_download_webpage(self, video_id):
1223                 """Report webpage download."""
1224                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1225
1226         def report_extraction(self, video_id):
1227                 """Report information extraction."""
1228                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1229
1230         def _real_initialize(self):
1231                 return
1232
1233         def _real_extract(self, url):
1234                 # Extract id from URL
1235                 mobj = re.match(self._VALID_URL, url)
1236                 if mobj is None:
1237                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1238                         return
1239
1240                 # At this point we have a new video
1241                 self._downloader.increment_downloads()
1242                 video_id = mobj.group(1)
1243
1244                 video_extension = 'mp4'
1245
1246                 # Retrieve video webpage to extract further information
1247                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1248                 try:
1249                         self.report_download_webpage(video_id)
1250                         webpage = urllib2.urlopen(request).read()
1251                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1252                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1253                         return
1254
1255                 # Extract URL, uploader, and title from webpage
1256                 self.report_extraction(video_id)
1257                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1258                 if mobj is None:
1259                         video_extension = 'flv'
1260                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1261                 if mobj is None:
1262                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1263                         return
1264                 mediaURL = urllib.unquote(mobj.group(1))
1265                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1266                 mediaURL = mediaURL.replace('\\x26', '\x26')
1267
1268                 video_url = mediaURL
1269
1270                 mobj = re.search(r'<title>(.*)</title>', webpage)
1271                 if mobj is None:
1272                         self._downloader.trouble(u'ERROR: unable to extract title')
1273                         return
1274                 video_title = mobj.group(1).decode('utf-8')
1275                 video_title = sanitize_title(video_title)
1276                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1277
1278                 # Extract video description
1279                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1280                 if mobj is None:
1281                         self._downloader.trouble(u'ERROR: unable to extract video description')
1282                         return
1283                 video_description = mobj.group(1).decode('utf-8')
1284                 if not video_description:
1285                         video_description = 'No description available.'
1286
1287                 # Extract video thumbnail
1288                 if self._downloader.params.get('forcethumbnail', False):
1289                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1290                         try:
1291                                 webpage = urllib2.urlopen(request).read()
1292                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1293                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1294                                 return
1295                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1296                         if mobj is None:
1297                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1298                                 return
1299                         video_thumbnail = mobj.group(1)
1300                 else:   # we need something to pass to process_info
1301                         video_thumbnail = ''
1302
1303
1304                 try:
1305                         # Process video information
1306                         self._downloader.process_info({
1307                                 'id':           video_id.decode('utf-8'),
1308                                 'url':          video_url.decode('utf-8'),
1309                                 'uploader':     u'NA',
1310                                 'uploaddate':   u'NA',
1311                                 'title':        video_title,
1312                                 'stitle':       simple_title,
1313                                 'ext':          video_extension.decode('utf-8'),
1314                                 'format':       u'NA',
1315                                 'player_url':   None,
1316                         })
1317                 except UnavailableVideoError:
1318                         self._downloader.trouble(u'ERROR: unable to download video')
1319
1320
1321 class PhotobucketIE(InfoExtractor):
1322         """Information extractor for photobucket.com."""
1323
1324         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1325
1326         def __init__(self, downloader=None):
1327                 InfoExtractor.__init__(self, downloader)
1328
1329         @staticmethod
1330         def suitable(url):
1331                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1332
1333         def report_download_webpage(self, video_id):
1334                 """Report webpage download."""
1335                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1336
1337         def report_extraction(self, video_id):
1338                 """Report information extraction."""
1339                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1340
1341         def _real_initialize(self):
1342                 return
1343
1344         def _real_extract(self, url):
1345                 # Extract id from URL
1346                 mobj = re.match(self._VALID_URL, url)
1347                 if mobj is None:
1348                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1349                         return
1350
1351                 # At this point we have a new video
1352                 self._downloader.increment_downloads()
1353                 video_id = mobj.group(1)
1354
1355                 video_extension = 'flv'
1356
1357                 # Retrieve video webpage to extract further information
1358                 request = urllib2.Request(url)
1359                 try:
1360                         self.report_download_webpage(video_id)
1361                         webpage = urllib2.urlopen(request).read()
1362                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1363                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1364                         return
1365
1366                 # Extract URL, uploader, and title from webpage
1367                 self.report_extraction(video_id)
1368                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1369                 if mobj is None:
1370                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1371                         return
1372                 mediaURL = urllib.unquote(mobj.group(1))
1373
1374                 video_url = mediaURL
1375
1376                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1377                 if mobj is None:
1378                         self._downloader.trouble(u'ERROR: unable to extract title')
1379                         return
1380                 video_title = mobj.group(1).decode('utf-8')
1381                 video_title = sanitize_title(video_title)
1382                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1383
1384                 video_uploader = mobj.group(2).decode('utf-8')
1385
1386                 try:
1387                         # Process video information
1388                         self._downloader.process_info({
1389                                 'id':           video_id.decode('utf-8'),
1390                                 'url':          video_url.decode('utf-8'),
1391                                 'uploader':     video_uploader,
1392                                 'uploaddate':   u'NA',
1393                                 'title':        video_title,
1394                                 'stitle':       simple_title,
1395                                 'ext':          video_extension.decode('utf-8'),
1396                                 'format':       u'NA',
1397                                 'player_url':   None,
1398                         })
1399                 except UnavailableVideoError:
1400                         self._downloader.trouble(u'ERROR: unable to download video')
1401
1402
1403 class YahooIE(InfoExtractor):
1404         """Information extractor for video.yahoo.com."""
1405
1406         # _VALID_URL matches all Yahoo! Video URLs
1407         # _VPAGE_URL matches only the extractable '/watch/' URLs
1408         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1409         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1410
1411         def __init__(self, downloader=None):
1412                 InfoExtractor.__init__(self, downloader)
1413
1414         @staticmethod
1415         def suitable(url):
1416                 return (re.match(YahooIE._VALID_URL, url) is not None)
1417
1418         def report_download_webpage(self, video_id):
1419                 """Report webpage download."""
1420                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1421
1422         def report_extraction(self, video_id):
1423                 """Report information extraction."""
1424                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1425
1426         def _real_initialize(self):
1427                 return
1428
1429         def _real_extract(self, url, new_video=True):
1430                 # Extract ID from URL
1431                 mobj = re.match(self._VALID_URL, url)
1432                 if mobj is None:
1433                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1434                         return
1435
1436                 # At this point we have a new video
1437                 self._downloader.increment_downloads()
1438                 video_id = mobj.group(2)
1439                 video_extension = 'flv'
1440
1441                 # Rewrite valid but non-extractable URLs as
1442                 # extractable English language /watch/ URLs
1443                 if re.match(self._VPAGE_URL, url) is None:
1444                         request = urllib2.Request(url)
1445                         try:
1446                                 webpage = urllib2.urlopen(request).read()
1447                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1448                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1449                                 return
1450
1451                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1452                         if mobj is None:
1453                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1454                                 return
1455                         yahoo_id = mobj.group(1)
1456
1457                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1458                         if mobj is None:
1459                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1460                                 return
1461                         yahoo_vid = mobj.group(1)
1462
1463                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1464                         return self._real_extract(url, new_video=False)
1465
1466                 # Retrieve video webpage to extract further information
1467                 request = urllib2.Request(url)
1468                 try:
1469                         self.report_download_webpage(video_id)
1470                         webpage = urllib2.urlopen(request).read()
1471                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1473                         return
1474
1475                 # Extract uploader and title from webpage
1476                 self.report_extraction(video_id)
1477                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1478                 if mobj is None:
1479                         self._downloader.trouble(u'ERROR: unable to extract video title')
1480                         return
1481                 video_title = mobj.group(1).decode('utf-8')
1482                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1483
1484                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1485                 if mobj is None:
1486                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1487                         return
1488                 video_uploader = mobj.group(1).decode('utf-8')
1489
1490                 # Extract video thumbnail
1491                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1492                 if mobj is None:
1493                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1494                         return
1495                 video_thumbnail = mobj.group(1).decode('utf-8')
1496
1497                 # Extract video description
1498                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1499                 if mobj is None:
1500                         self._downloader.trouble(u'ERROR: unable to extract video description')
1501                         return
1502                 video_description = mobj.group(1).decode('utf-8')
1503                 if not video_description: video_description = 'No description available.'
1504
1505                 # Extract video height and width
1506                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1507                 if mobj is None:
1508                         self._downloader.trouble(u'ERROR: unable to extract video height')
1509                         return
1510                 yv_video_height = mobj.group(1)
1511
1512                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1513                 if mobj is None:
1514                         self._downloader.trouble(u'ERROR: unable to extract video width')
1515                         return
1516                 yv_video_width = mobj.group(1)
1517
1518                 # Retrieve video playlist to extract media URL
1519                 # I'm not completely sure what all these options are, but we
1520                 # seem to need most of them, otherwise the server sends a 401.
1521                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1522                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1523                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1524                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1525                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1526                 try:
1527                         self.report_download_webpage(video_id)
1528                         webpage = urllib2.urlopen(request).read()
1529                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1530                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1531                         return
1532
1533                 # Extract media URL from playlist XML
1534                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1535                 if mobj is None:
1536                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1537                         return
1538                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1539                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1540
1541                 try:
1542                         # Process video information
1543                         self._downloader.process_info({
1544                                 'id':           video_id.decode('utf-8'),
1545                                 'url':          video_url,
1546                                 'uploader':     video_uploader,
1547                                 'uploaddate':   u'NA',
1548                                 'title':        video_title,
1549                                 'stitle':       simple_title,
1550                                 'ext':          video_extension.decode('utf-8'),
1551                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1552                                 'description':  video_description,
1553                                 'thumbnail':    video_thumbnail,
1554                                 'description':  video_description,
1555                                 'player_url':   None,
1556                         })
1557                 except UnavailableVideoError:
1558                         self._downloader.trouble(u'ERROR: unable to download video')
1559
1560
1561 class GenericIE(InfoExtractor):
1562         """Generic last-resort information extractor."""
1563
1564         def __init__(self, downloader=None):
1565                 InfoExtractor.__init__(self, downloader)
1566
1567         @staticmethod
1568         def suitable(url):
1569                 return True
1570
1571         def report_download_webpage(self, video_id):
1572                 """Report webpage download."""
1573                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1574                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1575
1576         def report_extraction(self, video_id):
1577                 """Report information extraction."""
1578                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1579
1580         def _real_initialize(self):
1581                 return
1582
1583         def _real_extract(self, url):
1584                 # At this point we have a new video
1585                 self._downloader.increment_downloads()
1586
1587                 video_id = url.split('/')[-1]
1588                 request = urllib2.Request(url)
1589                 try:
1590                         self.report_download_webpage(video_id)
1591                         webpage = urllib2.urlopen(request).read()
1592                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1593                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1594                         return
1595                 except ValueError, err:
1596                         # since this is the last-resort InfoExtractor, if
1597                         # this error is thrown, it'll be thrown here
1598                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1599                         return
1600
1601                 # Start with something easy: JW Player in SWFObject
1602                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1603                 if mobj is None:
1604                         # Broaden the search a little bit
1605                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1606                 if mobj is None:
1607                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1608                         return
1609
1610                 # It's possible that one of the regexes
1611                 # matched, but returned an empty group:
1612                 if mobj.group(1) is None:
1613                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1614                         return
1615
1616                 video_url = urllib.unquote(mobj.group(1))
1617                 video_id  = os.path.basename(video_url)
1618
1619                 # here's a fun little line of code for you:
1620                 video_extension = os.path.splitext(video_id)[1][1:]
1621                 video_id        = os.path.splitext(video_id)[0]
1622
1623                 # it's tempting to parse this further, but you would
1624                 # have to take into account all the variations like
1625                 #   Video Title - Site Name
1626                 #   Site Name | Video Title
1627                 #   Video Title - Tagline | Site Name
1628                 # and so on and so forth; it's just not practical
1629                 mobj = re.search(r'<title>(.*)</title>', webpage)
1630                 if mobj is None:
1631                         self._downloader.trouble(u'ERROR: unable to extract title')
1632                         return
1633                 video_title = mobj.group(1).decode('utf-8')
1634                 video_title = sanitize_title(video_title)
1635                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1636
1637                 # video uploader is domain name
1638                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1639                 if mobj is None:
1640                         self._downloader.trouble(u'ERROR: unable to extract title')
1641                         return
1642                 video_uploader = mobj.group(1).decode('utf-8')
1643
1644                 try:
1645                         # Process video information
1646                         self._downloader.process_info({
1647                                 'id':           video_id.decode('utf-8'),
1648                                 'url':          video_url.decode('utf-8'),
1649                                 'uploader':     video_uploader,
1650                                 'uploaddate':   u'NA',
1651                                 'title':        video_title,
1652                                 'stitle':       simple_title,
1653                                 'ext':          video_extension.decode('utf-8'),
1654                                 'format':       u'NA',
1655                                 'player_url':   None,
1656                         })
1657                 except UnavailableVideoError, err:
1658                         self._downloader.trouble(u'ERROR: unable to download video')
1659
1660
1661 class YoutubeSearchIE(InfoExtractor):
1662         """Information Extractor for YouTube search queries."""
1663         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1664         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1665         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1666         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1667         _youtube_ie = None
1668         _max_youtube_results = 1000
1669
1670         def __init__(self, youtube_ie, downloader=None):
1671                 InfoExtractor.__init__(self, downloader)
1672                 self._youtube_ie = youtube_ie
1673         
1674         @staticmethod
1675         def suitable(url):
1676                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1677
1678         def report_download_page(self, query, pagenum):
1679                 """Report attempt to download playlist page with given number."""
1680                 query = query.decode(preferredencoding())
1681                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1682
1683         def _real_initialize(self):
1684                 self._youtube_ie.initialize()
1685         
1686         def _real_extract(self, query):
1687                 mobj = re.match(self._VALID_QUERY, query)
1688                 if mobj is None:
1689                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1690                         return
1691
1692                 prefix, query = query.split(':')
1693                 prefix = prefix[8:]
1694                 query  = query.encode('utf-8')
1695                 if prefix == '':
1696                         self._download_n_results(query, 1)
1697                         return
1698                 elif prefix == 'all':
1699                         self._download_n_results(query, self._max_youtube_results)
1700                         return
1701                 else:
1702                         try:
1703                                 n = long(prefix)
1704                                 if n <= 0:
1705                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1706                                         return
1707                                 elif n > self._max_youtube_results:
1708                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1709                                         n = self._max_youtube_results
1710                                 self._download_n_results(query, n)
1711                                 return
1712                         except ValueError: # parsing prefix as integer fails
1713                                 self._download_n_results(query, 1)
1714                                 return
1715
1716         def _download_n_results(self, query, n):
1717                 """Downloads a specified number of results for a query"""
1718
1719                 video_ids = []
1720                 already_seen = set()
1721                 pagenum = 1
1722
1723                 while True:
1724                         self.report_download_page(query, pagenum)
1725                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1726                         request = urllib2.Request(result_url, None, std_headers)
1727                         try:
1728                                 page = urllib2.urlopen(request).read()
1729                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1730                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1731                                 return
1732
1733                         # Extract video identifiers
1734                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1735                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1736                                 if video_id not in already_seen:
1737                                         video_ids.append(video_id)
1738                                         already_seen.add(video_id)
1739                                         if len(video_ids) == n:
1740                                                 # Specified n videos reached
1741                                                 for id in video_ids:
1742                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1743                                                 return
1744
1745                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1746                                 for id in video_ids:
1747                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1748                                 return
1749
1750                         pagenum = pagenum + 1
1751
1752 class GoogleSearchIE(InfoExtractor):
1753         """Information Extractor for Google Video search queries."""
1754         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1755         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1756         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1757         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1758         _google_ie = None
1759         _max_google_results = 1000
1760
1761         def __init__(self, google_ie, downloader=None):
1762                 InfoExtractor.__init__(self, downloader)
1763                 self._google_ie = google_ie
1764         
1765         @staticmethod
1766         def suitable(url):
1767                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1768
1769         def report_download_page(self, query, pagenum):
1770                 """Report attempt to download playlist page with given number."""
1771                 query = query.decode(preferredencoding())
1772                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1773
	def _real_initialize(self):
		# Search needs no setup of its own; delegate to the wrapped
		# GoogleIE, which performs the actual per-video extraction.
		self._google_ie.initialize()
1776         
1777         def _real_extract(self, query):
1778                 mobj = re.match(self._VALID_QUERY, query)
1779                 if mobj is None:
1780                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1781                         return
1782
1783                 prefix, query = query.split(':')
1784                 prefix = prefix[8:]
1785                 query  = query.encode('utf-8')
1786                 if prefix == '':
1787                         self._download_n_results(query, 1)
1788                         return
1789                 elif prefix == 'all':
1790                         self._download_n_results(query, self._max_google_results)
1791                         return
1792                 else:
1793                         try:
1794                                 n = long(prefix)
1795                                 if n <= 0:
1796                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1797                                         return
1798                                 elif n > self._max_google_results:
1799                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1800                                         n = self._max_google_results
1801                                 self._download_n_results(query, n)
1802                                 return
1803                         except ValueError: # parsing prefix as integer fails
1804                                 self._download_n_results(query, 1)
1805                                 return
1806
	def _download_n_results(self, query, n):
		"""Download up to n search results for query and hand each one to
		the wrapped GoogleIE for extraction."""

		video_ids = []
		already_seen = set()	# dedup: the same id can appear on several pages
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached: extract and stop,
						# even in the middle of a result page.
						for id in video_ids:
							self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
						return

			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				# No "next page" link: extract whatever was collected.
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
				return

			pagenum = pagenum + 1
1842
1843 class YahooSearchIE(InfoExtractor):
1844         """Information Extractor for Yahoo! Video search queries."""
1845         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1846         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1847         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1848         _MORE_PAGES_INDICATOR = r'\s*Next'
1849         _yahoo_ie = None
1850         _max_yahoo_results = 1000
1851
1852         def __init__(self, yahoo_ie, downloader=None):
1853                 InfoExtractor.__init__(self, downloader)
1854                 self._yahoo_ie = yahoo_ie
1855         
1856         @staticmethod
1857         def suitable(url):
1858                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1859
1860         def report_download_page(self, query, pagenum):
1861                 """Report attempt to download playlist page with given number."""
1862                 query = query.decode(preferredencoding())
1863                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1864
1865         def _real_initialize(self):
1866                 self._yahoo_ie.initialize()
1867         
1868         def _real_extract(self, query):
1869                 mobj = re.match(self._VALID_QUERY, query)
1870                 if mobj is None:
1871                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1872                         return
1873
1874                 prefix, query = query.split(':')
1875                 prefix = prefix[8:]
1876                 query  = query.encode('utf-8')
1877                 if prefix == '':
1878                         self._download_n_results(query, 1)
1879                         return
1880                 elif prefix == 'all':
1881                         self._download_n_results(query, self._max_yahoo_results)
1882                         return
1883                 else:
1884                         try:
1885                                 n = long(prefix)
1886                                 if n <= 0:
1887                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1888                                         return
1889                                 elif n > self._max_yahoo_results:
1890                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1891                                         n = self._max_yahoo_results
1892                                 self._download_n_results(query, n)
1893                                 return
1894                         except ValueError: # parsing prefix as integer fails
1895                                 self._download_n_results(query, 1)
1896                                 return
1897
1898         def _download_n_results(self, query, n):
1899                 """Downloads a specified number of results for a query"""
1900
1901                 video_ids = []
1902                 already_seen = set()
1903                 pagenum = 1
1904
1905                 while True:
1906                         self.report_download_page(query, pagenum)
1907                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1908                         request = urllib2.Request(result_url, None, std_headers)
1909                         try:
1910                                 page = urllib2.urlopen(request).read()
1911                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1912                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1913                                 return
1914
1915                         # Extract video identifiers
1916                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1917                                 video_id = mobj.group(1)
1918                                 if video_id not in already_seen:
1919                                         video_ids.append(video_id)
1920                                         already_seen.add(video_id)
1921                                         if len(video_ids) == n:
1922                                                 # Specified n videos reached
1923                                                 for id in video_ids:
1924                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1925                                                 return
1926
1927                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1928                                 for id in video_ids:
1929                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1930                                 return
1931
1932                         pagenum = pagenum + 1
1933
1934 class YoutubePlaylistIE(InfoExtractor):
1935         """Information Extractor for YouTube playlists."""
1936
1937         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1938         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1939         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1940         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1941         _youtube_ie = None
1942
1943         def __init__(self, youtube_ie, downloader=None):
1944                 InfoExtractor.__init__(self, downloader)
1945                 self._youtube_ie = youtube_ie
1946         
1947         @staticmethod
1948         def suitable(url):
1949                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1950
1951         def report_download_page(self, playlist_id, pagenum):
1952                 """Report attempt to download playlist page with given number."""
1953                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1954
1955         def _real_initialize(self):
1956                 self._youtube_ie.initialize()
1957         
1958         def _real_extract(self, url):
1959                 # Extract playlist id
1960                 mobj = re.match(self._VALID_URL, url)
1961                 if mobj is None:
1962                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1963                         return
1964
1965                 # Download playlist pages
1966                 playlist_id = mobj.group(1)
1967                 video_ids = []
1968                 pagenum = 1
1969
1970                 while True:
1971                         self.report_download_page(playlist_id, pagenum)
1972                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1973                         try:
1974                                 page = urllib2.urlopen(request).read()
1975                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1976                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1977                                 return
1978
1979                         # Extract video identifiers
1980                         ids_in_page = []
1981                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1982                                 if mobj.group(1) not in ids_in_page:
1983                                         ids_in_page.append(mobj.group(1))
1984                         video_ids.extend(ids_in_page)
1985
1986                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1987                                 break
1988                         pagenum = pagenum + 1
1989
1990                 playliststart = self._downloader.params.get('playliststart', 1) - 1
1991                 playlistend = self._downloader.params.get('playlistend', -1)
1992                 video_ids = video_ids[playliststart:playlistend]
1993
1994                 for id in video_ids:
1995                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1996                 return
1997
1998 class YoutubeUserIE(InfoExtractor):
1999         """Information Extractor for YouTube users."""
2000
2001         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2002         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2003         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2004         _youtube_ie = None
2005
2006         def __init__(self, youtube_ie, downloader=None):
2007                 InfoExtractor.__init__(self, downloader)
2008                 self._youtube_ie = youtube_ie
2009         
2010         @staticmethod
2011         def suitable(url):
2012                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2013
2014         def report_download_page(self, username):
2015                 """Report attempt to download user page."""
2016                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2017
2018         def _real_initialize(self):
2019                 self._youtube_ie.initialize()
2020         
2021         def _real_extract(self, url):
2022                 # Extract username
2023                 mobj = re.match(self._VALID_URL, url)
2024                 if mobj is None:
2025                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2026                         return
2027
2028                 # Download user page
2029                 username = mobj.group(1)
2030                 video_ids = []
2031                 pagenum = 1
2032
2033                 self.report_download_page(username)
2034                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2035                 try:
2036                         page = urllib2.urlopen(request).read()
2037                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2038                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2039                         return
2040
2041                 # Extract video identifiers
2042                 ids_in_page = []
2043
2044                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2045                         if mobj.group(1) not in ids_in_page:
2046                                 ids_in_page.append(mobj.group(1))
2047                 video_ids.extend(ids_in_page)
2048
2049                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2050                 playlistend = self._downloader.params.get('playlistend', -1)
2051                 video_ids = video_ids[playliststart:playlistend]
2052
2053                 for id in video_ids:
2054                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2055                 return
2056
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method.  After every successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one: the first receives the initial information dictionary, every
	later one receives whatever the previous run() returned.  The chain
	stops as soon as a run() returns None or the last processor has run.

	Like InfoExtractor objects, PostProcessors mutually register with
	their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded video.

		'information' is a dictionary shaped like the ones built by
		InfoExtractors, extended with a "filepath" key that points at
		the downloaded file.

		Returning None stops the postprocessing chain; returning a
		dictionary (possibly the received one with some fields changed)
		passes it on to the next processor in the chain.

		Implementations may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		# Default behaviour: pass the information through untouched.
		return information
2102         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from github.com
2111                 def update_self(downloader, filename):
2112                         # Note: downloader only used for options
2113                         if not os.access (filename, os.W_OK):
2114                                 sys.exit('ERROR: no write permissions on %s' % filename)
2115
2116                         downloader.to_screen('Updating to latest stable version...')
2117                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2118                         latest_version = urllib.urlopen(latest_url).read().strip()
2119                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2120                         newcontent = urllib.urlopen(prog_url).read()
2121                         stream = open(filename, 'w')
2122                         stream.write(newcontent)
2123                         stream.close()
2124                         downloader.to_screen('Updated to version %s' % latest_version)
2125
2126                 # Parse command line
2127                 parser = optparse.OptionParser(
2128                         usage='Usage: %prog [options] url...',
2129                         version='2010.10.24',
2130                         conflict_handler='resolve',
2131                 )
2132
2133                 parser.add_option('-h', '--help',
2134                                 action='help', help='print this help text and exit')
2135                 parser.add_option('-v', '--version',
2136                                 action='version', help='print program version and exit')
2137                 parser.add_option('-U', '--update',
2138                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2139                 parser.add_option('-i', '--ignore-errors',
2140                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2141                 parser.add_option('-r', '--rate-limit',
2142                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2143                 parser.add_option('-R', '--retries',
2144                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2145                 parser.add_option('--playlist-start',
2146                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2147                 parser.add_option('--playlist-end',
2148                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2149
2150                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2151                 authentication.add_option('-u', '--username',
2152                                 dest='username', metavar='USERNAME', help='account username')
2153                 authentication.add_option('-p', '--password',
2154                                 dest='password', metavar='PASSWORD', help='account password')
2155                 authentication.add_option('-n', '--netrc',
2156                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2157                 parser.add_option_group(authentication)
2158
2159                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2160                 video_format.add_option('-f', '--format',
2161                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2162                 video_format.add_option('-m', '--mobile-version',
2163                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2164                 video_format.add_option('--all-formats',
2165                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2166                 video_format.add_option('--max-quality',
2167                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2168                 video_format.add_option('-b', '--best-quality',
2169                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2170                 parser.add_option_group(video_format)
2171
2172                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2173                 verbosity.add_option('-q', '--quiet',
2174                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2175                 verbosity.add_option('-s', '--simulate',
2176                                 action='store_true', dest='simulate', help='do not download video', default=False)
2177                 verbosity.add_option('-g', '--get-url',
2178                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2179                 verbosity.add_option('-e', '--get-title',
2180                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2181                 verbosity.add_option('--get-thumbnail',
2182                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2183                 verbosity.add_option('--get-description',
2184                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2185                 verbosity.add_option('--no-progress',
2186                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2187                 parser.add_option_group(verbosity)
2188
2189                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2190                 filesystem.add_option('-t', '--title',
2191                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2192                 filesystem.add_option('-l', '--literal',
2193                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2194                 filesystem.add_option('-A', '--auto-number',
2195                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2196                 filesystem.add_option('-o', '--output',
2197                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2198                 filesystem.add_option('-a', '--batch-file',
2199                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2200                 filesystem.add_option('-w', '--no-overwrites',
2201                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2202                 filesystem.add_option('-c', '--continue',
2203                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2204                 filesystem.add_option('--cookies',
2205                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2206                 parser.add_option_group(filesystem)
2207
2208                 (opts, args) = parser.parse_args()
2209
2210                 # Open appropriate CookieJar
2211                 if opts.cookiefile is None:
2212                         jar = cookielib.CookieJar()
2213                 else:
2214                         try:
2215                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2216                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2217                                         jar.load()
2218                         except (IOError, OSError), err:
2219                                 sys.exit(u'ERROR: unable to open cookie file')
2220
2221                 # General configuration
2222                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2223                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2224                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2225                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2226
2227                 # Batch file verification
2228                 batchurls = []
2229                 if opts.batchfile is not None:
2230                         try:
2231                                 if opts.batchfile == '-':
2232                                         batchfd = sys.stdin
2233                                 else:
2234                                         batchfd = open(opts.batchfile, 'r')
2235                                 batchurls = batchfd.readlines()
2236                                 batchurls = [x.strip() for x in batchurls]
2237                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2238                         except IOError:
2239                                 sys.exit(u'ERROR: batch file could not be read')
2240                 all_urls = batchurls + args
2241
2242                 # Conflicting, missing and erroneous options
2243                 if opts.bestquality:
2244                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2245                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2246                         parser.error(u'using .netrc conflicts with giving username/password')
2247                 if opts.password is not None and opts.username is None:
2248                         parser.error(u'account username missing')
2249                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2250                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2251                 if opts.usetitle and opts.useliteral:
2252                         parser.error(u'using title conflicts with using literal title')
2253                 if opts.username is not None and opts.password is None:
2254                         opts.password = getpass.getpass(u'Type account password and press return:')
2255                 if opts.ratelimit is not None:
2256                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2257                         if numeric_limit is None:
2258                                 parser.error(u'invalid rate limit specified')
2259                         opts.ratelimit = numeric_limit
2260                 if opts.retries is not None:
2261                         try:
2262                                 opts.retries = long(opts.retries)
2263                         except (TypeError, ValueError), err:
2264                                 parser.error(u'invalid retry count specified')
2265                 try:
2266                         opts.playliststart = long(opts.playliststart)
2267                         if opts.playliststart <= 0:
2268                                 raise ValueError
2269                 except (TypeError, ValueError), err:
2270                         parser.error(u'invalid playlist start number specified')
2271                 try:
2272                         opts.playlistend = long(opts.playlistend)
2273                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2274                                 raise ValueError
2275                 except (TypeError, ValueError), err:
2276                         parser.error(u'invalid playlist end number specified')
2277
2278                 # Information extractors
2279                 youtube_ie = YoutubeIE()
2280                 metacafe_ie = MetacafeIE(youtube_ie)
2281                 dailymotion_ie = DailymotionIE()
2282                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2283                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2284                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2285                 google_ie = GoogleIE()
2286                 google_search_ie = GoogleSearchIE(google_ie)
2287                 photobucket_ie = PhotobucketIE()
2288                 yahoo_ie = YahooIE()
2289                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2290                 generic_ie = GenericIE()
2291
2292                 # File downloader
2293                 fd = FileDownloader({
2294                         'usenetrc': opts.usenetrc,
2295                         'username': opts.username,
2296                         'password': opts.password,
2297                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2298                         'forceurl': opts.geturl,
2299                         'forcetitle': opts.gettitle,
2300                         'forcethumbnail': opts.getthumbnail,
2301                         'forcedescription': opts.getdescription,
2302                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2303                         'format': opts.format,
2304                         'format_limit': opts.format_limit,
2305                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2306                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2307                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2308                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2309                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2310                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2311                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2312                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2313                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2314                                 or u'%(id)s.%(ext)s'),
2315                         'ignoreerrors': opts.ignoreerrors,
2316                         'ratelimit': opts.ratelimit,
2317                         'nooverwrites': opts.nooverwrites,
2318                         'retries': opts.retries,
2319                         'continuedl': opts.continue_dl,
2320                         'noprogress': opts.noprogress,
2321                         'playliststart': opts.playliststart,
2322                         'playlistend': opts.playlistend,
2323                         'logtostderr': opts.outtmpl == '-',
2324                         })
2325                 fd.add_info_extractor(youtube_search_ie)
2326                 fd.add_info_extractor(youtube_pl_ie)
2327                 fd.add_info_extractor(youtube_user_ie)
2328                 fd.add_info_extractor(metacafe_ie)
2329                 fd.add_info_extractor(dailymotion_ie)
2330                 fd.add_info_extractor(youtube_ie)
2331                 fd.add_info_extractor(google_ie)
2332                 fd.add_info_extractor(google_search_ie)
2333                 fd.add_info_extractor(photobucket_ie)
2334                 fd.add_info_extractor(yahoo_ie)
2335                 fd.add_info_extractor(yahoo_search_ie)
2336
2337                 # This must come last since it's the
2338                 # fallback if none of the others work
2339                 fd.add_info_extractor(generic_ie)
2340
2341                 # Update version
2342                 if opts.update_self:
2343                         update_self(fd, sys.argv[0])
2344
2345                 # Maybe do nothing
2346                 if len(all_urls) < 1:
2347                         if not opts.update_self:
2348                                 parser.error(u'you must provide at least one URL')
2349                         else:
2350                                 sys.exit()
2351                 retcode = fd.download(all_urls)
2352
2353                 # Dump cookie jar if requested
2354                 if opts.cookiefile is not None:
2355                         try:
2356                                 jar.save()
2357                         except (IOError, OSError), err:
2358                                 sys.exit(u'ERROR: unable to save cookie jar')
2359
2360                 sys.exit(retcode)
2361
2362         except DownloadError:
2363                 sys.exit(1)
2364         except SameFileError:
2365                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2366         except KeyboardInterrupt:
2367                 sys.exit(u'\nERROR: Interrupted by user')