git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import locale
   8 import math
   9 import netrc
  10 import os
  11 import os.path
  12 import re
  13 import socket
  14 import string
  15 import sys
  16 import time
  17 import urllib
  18 import urllib2
  19
  20 std_headers = {
  21         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  22         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  23         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  24         'Accept-Language': 'en-us,en;q=0.5',
  25 }
  26
  27 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  28
  29 class DownloadError(Exception):
  30         """Download Error exception.
  31
  32         This exception may be thrown by FileDownloader objects if they are not
  33         configured to continue on errors. They will contain the appropriate
  34         error message.
  35         """
  36         pass
  37
  38 class SameFileError(Exception):
  39         """Same File exception.
  40
  41         This exception will be thrown by FileDownloader objects if they detect
  42         multiple files would have to be downloaded to the same file on disk.
  43         """
  44         pass
  45
  46 class PostProcessingError(Exception):
  47         """Post Processing exception.
  48
  49         This exception may be raised by PostProcessor's .run() method to
  50         indicate an error in the postprocessing task.
  51         """
  52         pass
  53
  54 class FileDownloader(object):
  55         """File Downloader class.
  56
  57         File downloader objects are the ones responsible of downloading the
  58         actual video file and writing it to disk if the user has requested
  59         it, among some other tasks. In most cases there should be one per
  60         program. As, given a video URL, the downloader doesn't know how to
  61         extract all the needed information, task that InfoExtractors do, it
  62         has to pass the URL to one of them.
  63
  64         For this, file downloader objects have a method that allows
  65         InfoExtractors to be registered in a given order. When it is passed
  66         a URL, the file downloader handles it to the first InfoExtractor it
  67         finds that reports being able to handle it. The InfoExtractor returns
  68         all the information to the FileDownloader and the latter downloads the
  69         file or does whatever it's instructed to do.
  70
  71         File downloaders accept a lot of parameters. In order not to saturate
  72         the object constructor with arguments, it receives a dictionary of
  73         options instead. These options are available through the get_params()
  74         method for the InfoExtractors to use. The FileDownloader also registers
  75         itself as the downloader in charge for the InfoExtractors that are
  76         added to it, so this is a "mutual registration".
  77
  78         Available options:
  79
  80         username:       Username for authentication purposes.
  81         password:       Password for authentication purposes.
  82         usenetrc:       Use netrc for authentication instead.
  83         quiet:          Do not print messages to stdout.
  84         forceurl:       Force printing final URL.
  85         forcetitle:     Force printing title.
  86         simulate:       Do not download the video files.
  87         format:         Video format code.
  88         outtmpl:        Template for output names.
  89         ignoreerrors:   Do not stop on download errors.
  90         ratelimit:      Download speed limit, in bytes/sec.
  91         """
  92
  93         _params = None
  94         _ies = []
  95         _pps = []
  96
  97         def __init__(self, params):
  98                 """Create a FileDownloader object with the given options."""
  99                 self._ies = []
 100                 self._pps = []
 101                 self.set_params(params)
 102
 103         @staticmethod
 104         def pmkdir(filename):
 105                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 106                 components = filename.split(os.sep)
 107                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 108                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 109                 for dir in aggregate:
 110                         if not os.path.exists(dir):
 111                                 os.mkdir(dir)
 112
 113         @staticmethod
 114         def format_bytes(bytes):
 115                 if bytes is None:
 116                         return 'N/A'
 117                 if bytes == 0:
 118                         exponent = 0
 119                 else:
 120                         exponent = long(math.log(float(bytes), 1024.0))
 121                 suffix = 'bkMGTPEZY'[exponent]
 122                 converted = float(bytes) / float(1024**exponent)
 123                 return '%.2f%s' % (converted, suffix)
 124
 125         @staticmethod
 126         def calc_percent(byte_counter, data_len):
 127                 if data_len is None:
 128                         return '---.-%'
 129                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 130
 131         @staticmethod
 132         def calc_eta(start, now, total, current):
 133                 if total is None:
 134                         return '--:--'
 135                 dif = now - start
 136                 if current == 0 or dif < 0.001: # One millisecond
 137                         return '--:--'
 138                 rate = float(current) / dif
 139                 eta = long((float(total) - float(current)) / rate)
 140                 (eta_mins, eta_secs) = divmod(eta, 60)
 141                 if eta_mins > 99:
 142                         return '--:--'
 143                 return '%02d:%02d' % (eta_mins, eta_secs)
 144
 145         @staticmethod
 146         def calc_speed(start, now, bytes):
 147                 dif = now - start
 148                 if bytes == 0 or dif < 0.001: # One millisecond
 149                         return '%10s' % '---b/s'
 150                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 151
 152         @staticmethod
 153         def best_block_size(elapsed_time, bytes):
 154                 new_min = max(bytes / 2.0, 1.0)
 155                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 156                 if elapsed_time < 0.001:
 157                         return int(new_max)
 158                 rate = bytes / elapsed_time
 159                 if rate > new_max:
 160                         return int(new_max)
 161                 if rate < new_min:
 162                         return int(new_min)
 163                 return int(rate)
 164
 165         @staticmethod
 166         def parse_bytes(bytestr):
 167                 """Parse a string indicating a byte quantity into a long integer."""
 168                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 169                 if matchobj is None:
 170                         return None
 171                 number = float(matchobj.group(1))
 172                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 173                 return long(round(number * multiplier))
 174
 175         def set_params(self, params):
 176                 """Sets parameters."""
 177                 if type(params) != dict:
 178                         raise ValueError('params: dictionary expected')
 179                 self._params = params
 180
 181         def get_params(self):
 182                 """Get parameters."""
 183                 return self._params
 184
 185         def add_info_extractor(self, ie):
 186                 """Add an InfoExtractor object to the end of the list."""
 187                 self._ies.append(ie)
 188                 ie.set_downloader(self)
 189
 190         def add_post_processor(self, pp):
 191                 """Add a PostProcessor object to the end of the chain."""
 192                 self._pps.append(pp)
 193                 pp.set_downloader(self)
 194
 195         def to_stdout(self, message, skip_eol=False):
 196                 """Print message to stdout if not in quiet mode."""
 197                 if not self._params.get('quiet', False):
 198                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
 199                         sys.stdout.flush()
 200
 201         def to_stderr(self, message):
 202                 """Print message to stderr."""
 203                 print >>sys.stderr, message
 204
 205         def fixed_template(self):
 206                 """Checks if the output template is fixed."""
 207                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 208
 209         def trouble(self, message=None):
 210                 """Determine action to take when a download problem appears.
 211
 212                 Depending on if the downloader has been configured to ignore
 213                 download errors or not, this method may throw an exception or
 214                 not when errors are found, after printing the message. If it
 215                 doesn't raise, it returns an error code suitable to be returned
 216                 later as a program exit code to indicate error.
 217                 """
 218                 if message is not None:
 219                         self.to_stderr(message)
 220                 if not self._params.get('ignoreerrors', False):
 221                         raise DownloadError(message)
 222                 return 1
 223
 224         def slow_down(self, start_time, byte_counter):
 225                 """Sleep if the download speed is over the rate limit."""
 226                 rate_limit = self._params.get('ratelimit', None)
 227                 if rate_limit is None or byte_counter == 0:
 228                         return
 229                 now = time.time()
 230                 elapsed = now - start_time
 231                 if elapsed <= 0.0:
 232                         return
 233                 speed = float(byte_counter) / elapsed
 234                 if speed > rate_limit:
 235                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 236
 237         def report_destination(self, filename):
 238                 """Report destination filename."""
 239                 self.to_stdout(u'[download] Destination: %s' % filename)
 240
 241         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 242                 """Report download progress."""
 243                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 244                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 245
 246         def report_finish(self):
 247                 """Report download finished."""
 248                 self.to_stdout(u'')
 249
 250         def download(self, url_list):
 251                 """Download a given list of URLs."""
 252                 retcode = 0
 253                 if len(url_list) > 1 and self.fixed_template():
 254                         raise SameFileError(self._params['outtmpl'])
 255
 256                 for url in url_list:
 257                         suitable_found = False
 258                         for ie in self._ies:
 259                                 if not ie.suitable(url):
 260                                         continue
 261                                 # Suitable InfoExtractor found
 262                                 suitable_found = True
 263                                 all_results = ie.extract(url)
 264                                 results = [x for x in all_results if x is not None]
 265                                 if len(results) != len(all_results):
 266                                         retcode = self.trouble()
 267
 268                                 if len(results) > 1 and self.fixed_template():
 269                                         raise SameFileError(self._params['outtmpl'])
 270
 271                                 for result in results:
 272                                         # Forced printings
 273                                         if self._params.get('forcetitle', False):
 274                                                 print result['title']
 275                                         if self._params.get('forceurl', False):
 276                                                 print result['url']
 277
 278                                         # Do nothing else if in simulate mode
 279                                         if self._params.get('simulate', False):
 280                                                 continue
 281
 282                                         try:
 283                                                 filename = self._params['outtmpl'] % result
 284                                                 self.report_destination(filename)
 285                                         except (ValueError, KeyError), err:
 286                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 287                                                 continue
 288                                         try:
 289                                                 self.pmkdir(filename)
 290                                         except (OSError, IOError), err:
 291                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 292                                                 continue
 293                                         try:
 294                                                 outstream = open(filename, 'wb')
 295                                         except (OSError, IOError), err:
 296                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 297                                                 continue
 298                                         try:
 299                                                 self._do_download(outstream, result['url'])
 300                                                 outstream.close()
 301                                         except (OSError, IOError), err:
 302                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 303                                                 continue
 304                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 305                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 306                                                 continue
 307                                         try:
 308                                                 self.post_process(filename, result)
 309                                         except (PostProcessingError), err:
 310                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
 311                                                 continue
 312
 313                                 break
 314                         if not suitable_found:
 315                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 316
 317                 return retcode
 318
 319         def post_process(self, filename, ie_info):
 320                 """Run the postprocessing chain on the given file."""
 321                 info = dict(ie_info)
 322                 info['filepath'] = filename
 323                 for pp in self._pps:
 324                         info = pp.run(info)
 325                         if info is None:
 326                                 break
 327
 328         def _do_download(self, stream, url):
 329                 request = urllib2.Request(url, None, std_headers)
 330                 data = urllib2.urlopen(request)
 331                 data_len = data.info().get('Content-length', None)
 332                 data_len_str = self.format_bytes(data_len)
 333                 byte_counter = 0
 334                 block_size = 1024
 335                 start = time.time()
 336                 while True:
 337                         # Progress message
 338                         percent_str = self.calc_percent(byte_counter, data_len)
 339                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 340                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 341                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 342
 343                         # Download and write
 344                         before = time.time()
 345                         data_block = data.read(block_size)
 346                         after = time.time()
 347                         data_block_len = len(data_block)
 348                         if data_block_len == 0:
 349                                 break
 350                         byte_counter += data_block_len
 351                         stream.write(data_block)
 352                         block_size = self.best_block_size(after - before, data_block_len)
 353
 354                         # Apply rate limit
 355                         self.slow_down(start, byte_counter)
 356
 357                 self.report_finish()
 358                 if data_len is not None and str(byte_counter) != data_len:
 359                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 360
 361 class InfoExtractor(object):
 362         """Information Extractor class.
 363
 364         Information extractors are the classes that, given a URL, extract
 365         information from the video (or videos) the URL refers to. This
 366         information includes the real video URL, the video title and simplified
 367         title, author and others. It is returned in a list of dictionaries when
 368         calling its extract() method. It is a list because a URL can refer to
 369         more than one video (think of playlists). The dictionaries must include
 370         the following fields:
 371
 372         id:             Video identifier.
 373         url:            Final video URL.
 374         uploader:       Nickname of the video uploader.
 375         title:          Literal title.
 376         stitle:         Simplified title.
 377         ext:            Video filename extension.
 378
 379         Subclasses of this one should re-define the _real_initialize() and
 380         _real_extract() methods, as well as the suitable() static method.
 381         Probably, they should also be instantiated and added to the main
 382         downloader.
 383         """
 384
 385         _ready = False
 386         _downloader = None
 387
 388         def __init__(self, downloader=None):
 389                 """Constructor. Receives an optional downloader."""
 390                 self._ready = False
 391                 self.set_downloader(downloader)
 392
 393         @staticmethod
 394         def suitable(url):
 395                 """Receives a URL and returns True if suitable for this IE."""
 396                 return False
 397
 398         def initialize(self):
 399                 """Initializes an instance (authentication, etc)."""
 400                 if not self._ready:
 401                         self._real_initialize()
 402                         self._ready = True
 403
 404         def extract(self, url):
 405                 """Extracts URL information and returns it in list of dicts."""
 406                 self.initialize()
 407                 return self._real_extract(url)
 408
 409         def set_downloader(self, downloader):
 410                 """Sets the downloader for this IE."""
 411                 self._downloader = downloader
 412
 413         def to_stdout(self, message):
 414                 """Print message to stdout if downloader is not in quiet mode."""
 415                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 416                         print message
 417
 418         def to_stderr(self, message):
 419                 """Print message to stderr."""
 420                 print >>sys.stderr, message
 421
 422         def _real_initialize(self):
 423                 """Real initialization process. Redefine in subclasses."""
 424                 pass
 425
 426         def _real_extract(self, url):
 427                 """Real extraction process. Redefine in subclasses."""
 428                 pass
 429
 430 class YoutubeIE(InfoExtractor):
 431         """Information extractor for youtube.com."""
 432
 433         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 434         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 435         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 436         _NETRC_MACHINE = 'youtube'
 437
 438         @staticmethod
 439         def suitable(url):
 440                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 441
 442         def report_login(self):
 443                 """Report attempt to log in."""
 444                 self.to_stdout(u'[youtube] Logging in')
 445
 446         def report_age_confirmation(self):
 447                 """Report attempt to confirm age."""
 448                 self.to_stdout(u'[youtube] Confirming age')
 449
 450         def report_webpage_download(self, video_id):
 451                 """Report attempt to download webpage."""
 452                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 453
 454         def report_information_extraction(self, video_id):
 455                 """Report attempt to extract video information."""
 456                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 457
 458         def report_video_url(self, video_id, video_real_url):
 459                 """Report extracted video URL."""
 460                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 461
 462         def _real_initialize(self):
 463                 if self._downloader is None:
 464                         return
 465
 466                 username = None
 467                 password = None
 468                 downloader_params = self._downloader.get_params()
 469
 470                 # Attempt to use provided username and password or .netrc data
 471                 if downloader_params.get('username', None) is not None:
 472                         username = downloader_params['username']
 473                         password = downloader_params['password']
 474                 elif downloader_params.get('usenetrc', False):
 475                         try:
 476                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 477                                 if info is not None:
 478                                         username = info[0]
 479                                         password = info[2]
 480                                 else:
 481                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 482                         except (IOError, netrc.NetrcParseError), err:
 483                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 484                                 return
 485
 486                 # No authentication to be performed
 487                 if username is None:
 488                         return
 489
 490                 # Log in
 491                 login_form = {
 492                                 'current_form': 'loginForm',
 493                                 'next':         '/',
 494                                 'action_login': 'Log In',
 495                                 'username':     username,
 496                                 'password':     password,
 497                                 }
 498                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 499                 try:
 500                         self.report_login()
 501                         login_results = urllib2.urlopen(request).read()
 502                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 503                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
 504                                 return
 505                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 506                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 507                         return
 508
 509                 # Confirm age
 510                 age_form = {
 511                                 'next_url':             '/',
 512                                 'action_confirm':       'Confirm',
 513                                 }
 514                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 515                 try:
 516                         self.report_age_confirmation()
 517                         age_results = urllib2.urlopen(request).read()
 518                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 519                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 520                         return
 521
 522         def _real_extract(self, url):
 523                 # Extract video id from URL
 524                 mobj = re.match(self._VALID_URL, url)
 525                 if mobj is None:
 526                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 527                         return [None]
 528                 video_id = mobj.group(2)
 529
 530                 # Downloader parameters
 531                 format_param = None
 532                 if self._downloader is not None:
 533                         params = self._downloader.get_params()
 534                         format_param = params.get('format', None)
 535
 536                 # Extension
 537                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
 538
 539                 # Normalize URL, including format
 540                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 541                 if format_param is not None:
 542                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 543                 request = urllib2.Request(normalized_url, None, std_headers)
 544                 try:
 545                         self.report_webpage_download(video_id)
 546                         video_webpage = urllib2.urlopen(request).read()
 547                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 548                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
 549                         return [None]
 550                 self.report_information_extraction(video_id)
 551
 552                 # "t" param
 553                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 554                 if mobj is None:
 555                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
 556                         return [None]
 557                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 558                 if format_param is not None:
 559                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 560                 self.report_video_url(video_id, video_real_url)
 561
 562                 # uploader
 563                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 564                 if mobj is None:
 565                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 566                         return [None]
 567                 video_uploader = mobj.group(1)
 568
 569                 # title
 570                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 571                 if mobj is None:
 572                         self.to_stderr(u'ERROR: unable to extract video title')
 573                         return [None]
 574                 video_title = mobj.group(1).decode('utf-8')
 575                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 576                 video_title = video_title.replace(os.sep, u'%')
 577
 578                 # simplified title
 579                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 580                 simple_title = simple_title.strip(ur'_')
 581
 582                 # Return information
 583                 return [{
 584                         'id':           video_id.decode('utf-8'),
 585                         'url':          video_real_url.decode('utf-8'),
 586                         'uploader':     video_uploader.decode('utf-8'),
 587                         'title':        video_title,
 588                         'stitle':       simple_title,
 589                         'ext':          video_extension.decode('utf-8'),
 590                         }]
 591
 592 class MetacafeIE(InfoExtractor):
 593         """Information Extractor for metacafe.com."""
 594
 595         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 596         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 597         _youtube_ie = None
 598
 599         def __init__(self, youtube_ie, downloader=None):
 600                 InfoExtractor.__init__(self, downloader)
 601                 self._youtube_ie = youtube_ie
 602
 603         @staticmethod
 604         def suitable(url):
 605                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 606
 607         def report_disclaimer(self):
 608                 """Report disclaimer retrieval."""
 609                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
 610
 611         def report_age_confirmation(self):
 612                 """Report attempt to confirm age."""
 613                 self.to_stdout(u'[metacafe] Confirming age')
 614
 615         def report_download_webpage(self, video_id):
 616                 """Report webpage download."""
 617                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 618
 619         def report_extraction(self, video_id):
 620                 """Report information extraction."""
 621                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 622
 623         def _real_initialize(self):
 624                 # Retrieve disclaimer
 625                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 626                 try:
 627                         self.report_disclaimer()
 628                         disclaimer = urllib2.urlopen(request).read()
 629                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 630                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 631                         return
 632
 633                 # Confirm age
 634                 disclaimer_form = {
 635                         'filters': '0',
 636                         'submit': "Continue - I'm over 18",
 637                         }
 638                 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
 639                 try:
 640                         self.report_age_confirmation()
 641                         disclaimer = urllib2.urlopen(request).read()
 642                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 643                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 644                         return
 645
 646         def _real_extract(self, url):
 647                 # Extract id and simplified title from URL
 648                 mobj = re.match(self._VALID_URL, url)
 649                 if mobj is None:
 650                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 651                         return [None]
 652
 653                 video_id = mobj.group(1)
 654
 655                 # Check if video comes from YouTube
 656                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 657                 if mobj2 is not None:
 658                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 659
 660                 simple_title = mobj.group(2).decode('utf-8')
 661                 video_extension = 'flv'
 662
 663                 # Retrieve video webpage to extract further information
 664                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 665                 try:
 666                         self.report_download_webpage(video_id)
 667                         webpage = urllib2.urlopen(request).read()
 668                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 669                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
 670                         return [None]
 671
 672                 # Extract URL, uploader and title from webpage
 673                 self.report_extraction(video_id)
 674                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
 675                 if mobj is None:
 676                         self.to_stderr(u'ERROR: unable to extract media URL')
 677                         return [None]
 678                 mediaURL = mobj.group(1).replace('\\', '')
 679
 680                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
 681                 if mobj is None:
 682                         self.to_stderr(u'ERROR: unable to extract gdaKey')
 683                         return [None]
 684                 gdaKey = mobj.group(1)
 685
 686                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 687
 688                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 689                 if mobj is None:
 690                         self.to_stderr(u'ERROR: unable to extract title')
 691                         return [None]
 692                 video_title = mobj.group(1).decode('utf-8')
 693
 694                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
 695                 if mobj is None:
 696                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 697                         return [None]
 698                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
 699
 700                 # Return information
 701                 return [{
 702                         'id':           video_id.decode('utf-8'),
 703                         'url':          video_url.decode('utf-8'),
 704                         'uploader':     video_uploader.decode('utf-8'),
 705                         'title':        video_title,
 706                         'stitle':       simple_title,
 707                         'ext':          video_extension.decode('utf-8'),
 708                         }]
 709
 710 class YoutubePlaylistIE(InfoExtractor):
 711         """Information Extractor for YouTube playlists."""
 712
 713         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 714         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s'
 715         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 716         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 717         _youtube_ie = None
 718
 719         def __init__(self, youtube_ie, downloader=None):
 720                 InfoExtractor.__init__(self, downloader)
 721                 self._youtube_ie = youtube_ie
 722
 723         @staticmethod
 724         def suitable(url):
 725                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 726
 727         def report_download_page(self, playlist_id, pagenum):
 728                 """Report attempt to download playlist page with given number."""
 729                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 730
 731         def _real_initialize(self):
 732                 self._youtube_ie.initialize()
 733
 734         def _real_extract(self, url):
 735                 # Extract playlist id
 736                 mobj = re.match(self._VALID_URL, url)
 737                 if mobj is None:
 738                         self.to_stderr(u'ERROR: invalid url: %s' % url)
 739                         return [None]
 740
 741                 # Download playlist pages
 742                 playlist_id = mobj.group(1)
 743                 video_ids = []
 744                 pagenum = 1
 745
 746                 while True:
 747                         self.report_download_page(playlist_id, pagenum)
 748                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 749                         try:
 750                                 page = urllib2.urlopen(request).read()
 751                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 752                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
 753                                 return [None]
 754
 755                         # Extract video identifiers
 756                         ids_in_page = set()
 757                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 758                                 ids_in_page.add(mobj.group(1))
 759                         video_ids.extend(list(ids_in_page))
 760
 761                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 762                                 break
 763                         pagenum = pagenum + 1
 764
 765                 information = []
 766                 for id in video_ids:
 767                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 768                 return information
 769
 770 class PostProcessor(object):
 771         """Post Processor class.
 772
 773         PostProcessor objects can be added to downloaders with their
 774         add_post_processor() method. When the downloader has finished a
 775         successful download, it will take its internal chain of PostProcessors
 776         and start calling the run() method on each one of them, first with
 777         an initial argument and then with the returned value of the previous
 778         PostProcessor.
 779
 780         The chain will be stopped if one of them ever returns None or the end
 781         of the chain is reached.
 782
 783         PostProcessor objects follow a "mutual registration" process similar
 784         to InfoExtractor objects.
 785         """
 786
 787         _downloader = None
 788
 789         def __init__(self, downloader=None):
 790                 self._downloader = downloader
 791
 792         def to_stdout(self, message):
 793                 """Print message to stdout if downloader is not in quiet mode."""
 794                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 795                         print message
 796
 797         def to_stderr(self, message):
 798                 """Print message to stderr."""
 799                 print >>sys.stderr, message
 800
 801         def set_downloader(self, downloader):
 802                 """Sets the downloader for this PP."""
 803                 self._downloader = downloader
 804
 805         def run(self, information):
 806                 """Run the PostProcessor.
 807
 808                 The "information" argument is a dictionary like the ones
 809                 returned by InfoExtractors. The only difference is that this
 810                 one has an extra field called "filepath" that points to the
 811                 downloaded file.
 812
 813                 When this method returns None, the postprocessing chain is
 814                 stopped. However, this method may return an information
 815                 dictionary that will be passed to the next postprocessing
 816                 object in the chain. It can be the one it received after
 817                 changing some fields.
 818
 819                 In addition, this method may raise a PostProcessingError
 820                 exception that will be taken into account by the downloader
 821                 it was called from.
 822                 """
 823                 return information # by default, do nothing
 824
 825 ### MAIN PROGRAM ###
 826 if __name__ == '__main__':
 827         try:
 828                 # Modules needed only when running the main program
 829                 import getpass
 830                 import optparse
 831
 832                 # General configuration
 833                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 834                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 835                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 836
 837                 # Parse command line
 838                 parser = optparse.OptionParser(
 839                                 usage='Usage: %prog [options] url...',
 840                                 version='2008.09.20',
 841                                 conflict_handler='resolve',
 842                                 )
 843                 parser.add_option('-h', '--help',
 844                                 action='help', help='print this help text and exit')
 845                 parser.add_option('-v', '--version',
 846                                 action='version', help='print program version and exit')
 847                 parser.add_option('-u', '--username',
 848                                 dest='username', metavar='UN', help='account username')
 849                 parser.add_option('-p', '--password',
 850                                 dest='password', metavar='PW', help='account password')
 851                 parser.add_option('-o', '--output',
 852                                 dest='outtmpl', metavar='TPL', help='output filename template')
 853                 parser.add_option('-q', '--quiet',
 854                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 855                 parser.add_option('-s', '--simulate',
 856                                 action='store_true', dest='simulate', help='do not download video', default=False)
 857                 parser.add_option('-t', '--title',
 858                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 859                 parser.add_option('-l', '--literal',
 860                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 861                 parser.add_option('-n', '--netrc',
 862                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 863                 parser.add_option('-g', '--get-url',
 864                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 865                 parser.add_option('-e', '--get-title',
 866                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 867                 parser.add_option('-f', '--format',
 868                                 dest='format', metavar='FMT', help='video format code')
 869                 parser.add_option('-b', '--best-quality',
 870                                 action='store_const', dest='format', help='alias for -f 18', const='18')
 871                 parser.add_option('-m', '--mobile-version',
 872                                 action='store_const', dest='format', help='alias for -f 17', const='17')
 873                 parser.add_option('-i', '--ignore-errors',
 874                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 875                 parser.add_option('-r', '--rate-limit',
 876                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
 877                 (opts, args) = parser.parse_args()
 878
 879                 # Conflicting, missing and erroneous options
 880                 if len(args) < 1:
 881                         sys.exit(u'ERROR: you must provide at least one URL')
 882                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 883                         sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
 884                 if opts.password is not None and opts.username is None:
 885                         sys.exit(u'ERROR: account username missing')
 886                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 887                         sys.exit(u'ERROR: using output template conflicts with using title or literal title')
 888                 if opts.usetitle and opts.useliteral:
 889                         sys.exit(u'ERROR: using title conflicts with using literal title')
 890                 if opts.username is not None and opts.password is None:
 891                         opts.password = getpass.getpass(u'Type account password and press return:')
 892                 if opts.ratelimit is not None:
 893                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
 894                         if numeric_limit is None:
 895                                 sys.exit(u'ERROR: invalid rate limit specified')
 896                         opts.ratelimit = numeric_limit
 897
 898                 # Information extractors
 899                 youtube_ie = YoutubeIE()
 900                 metacafe_ie = MetacafeIE(youtube_ie)
 901                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
 902
 903                 # File downloader
 904                 fd = FileDownloader({
 905                         'usenetrc': opts.usenetrc,
 906                         'username': opts.username,
 907                         'password': opts.password,
 908                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 909                         'forceurl': opts.geturl,
 910                         'forcetitle': opts.gettitle,
 911                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 912                         'format': opts.format,
 913                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getdefaultlocale()[1]))
 914                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
 915                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
 916                                 or u'%(id)s.%(ext)s'),
 917                         'ignoreerrors': opts.ignoreerrors,
 918                         'ratelimit': opts.ratelimit,
 919                         })
 920                 fd.add_info_extractor(youtube_pl_ie)
 921                 fd.add_info_extractor(metacafe_ie)
 922                 fd.add_info_extractor(youtube_ie)
 923                 retcode = fd.download(args)
 924                 sys.exit(retcode)
 925
 926         except DownloadError:
 927                 sys.exit(1)
 928         except SameFileError:
 929                 sys.exit(u'ERROR: fixed output name but more than one file to download')
 930         except KeyboardInterrupt:
 931                 sys.exit(u'\nERROR: Interrupted by user')