git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class DownloadError(Exception):
  29         """Download Error exception.
  30
  31         This exception may be thrown by FileDownloader objects if they are not
  32         configured to continue on errors. They will contain the appropriate
  33         error message.
  34         """
  35         pass
  36
  37 class SameFileError(Exception):
  38         """Same File exception.
  39
  40         This exception will be thrown by FileDownloader objects if they detect
  41         multiple files would have to be downloaded to the same file on disk.
  42         """
  43         pass
  44
  45 class PostProcessingError(Exception):
  46         """Post Processing exception.
  47
  48         This exception may be raised by PostProcessor's .run() method to
  49         indicate an error in the postprocessing task.
  50         """
  51         pass
  52
  53 class FileDownloader(object):
  54         """File Downloader class.
  55
  56         File downloader objects are the ones responsible of downloading the
  57         actual video file and writing it to disk if the user has requested
  58         it, among some other tasks. In most cases there should be one per
  59         program. As, given a video URL, the downloader doesn't know how to
  60         extract all the needed information, task that InfoExtractors do, it
  61         has to pass the URL to one of them.
  62
  63         For this, file downloader objects have a method that allows
  64         InfoExtractors to be registered in a given order. When it is passed
  65         a URL, the file downloader handles it to the first InfoExtractor it
  66         finds that reports being able to handle it. The InfoExtractor returns
  67         all the information to the FileDownloader and the latter downloads the
  68         file or does whatever it's instructed to do.
  69
  70         File downloaders accept a lot of parameters. In order not to saturate
  71         the object constructor with arguments, it receives a dictionary of
  72         options instead. These options are available through the get_params()
  73         method for the InfoExtractors to use. The FileDownloader also registers
  74         itself as the downloader in charge for the InfoExtractors that are
  75         added to it, so this is a "mutual registration".
  76
  77         Available options:
  78
  79         username:       Username for authentication purposes.
  80         password:       Password for authentication purposes.
  81         usenetrc:       Use netrc for authentication instead.
  82         quiet:          Do not print messages to stdout.
  83         forceurl:       Force printing final URL.
  84         forcetitle:     Force printing title.
  85         simulate:       Do not download the video files.
  86         format:         Video format code.
  87         outtmpl:        Template for output names.
  88         ignoreerrors:   Do not stop on download errors.
  89         ratelimit:      Download speed limit, in bytes/sec.
  90         """
  91
  92         _params = None
  93         _ies = []
  94         _pps = []
  95
  96         def __init__(self, params):
  97                 """Create a FileDownloader object with the given options."""
  98                 self._ies = []
  99                 self._pps = []
 100                 self.set_params(params)
 101
 102         @staticmethod
 103         def pmkdir(filename):
 104                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 105                 components = filename.split(os.sep)
 106                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 107                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 108                 for dir in aggregate:
 109                         if not os.path.exists(dir):
 110                                 os.mkdir(dir)
 111
 112         @staticmethod
 113         def format_bytes(bytes):
 114                 if bytes is None:
 115                         return 'N/A'
 116                 if bytes == 0:
 117                         exponent = 0
 118                 else:
 119                         exponent = long(math.log(float(bytes), 1024.0))
 120                 suffix = 'bkMGTPEZY'[exponent]
 121                 converted = float(bytes) / float(1024**exponent)
 122                 return '%.2f%s' % (converted, suffix)
 123
 124         @staticmethod
 125         def calc_percent(byte_counter, data_len):
 126                 if data_len is None:
 127                         return '---.-%'
 128                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 129
 130         @staticmethod
 131         def calc_eta(start, now, total, current):
 132                 if total is None:
 133                         return '--:--'
 134                 dif = now - start
 135                 if current == 0 or dif < 0.001: # One millisecond
 136                         return '--:--'
 137                 rate = float(current) / dif
 138                 eta = long((float(total) - float(current)) / rate)
 139                 (eta_mins, eta_secs) = divmod(eta, 60)
 140                 if eta_mins > 99:
 141                         return '--:--'
 142                 return '%02d:%02d' % (eta_mins, eta_secs)
 143
 144         @staticmethod
 145         def calc_speed(start, now, bytes):
 146                 dif = now - start
 147                 if bytes == 0 or dif < 0.001: # One millisecond
 148                         return '%10s' % '---b/s'
 149                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 150
 151         @staticmethod
 152         def best_block_size(elapsed_time, bytes):
 153                 new_min = max(bytes / 2.0, 1.0)
 154                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 155                 if elapsed_time < 0.001:
 156                         return int(new_max)
 157                 rate = bytes / elapsed_time
 158                 if rate > new_max:
 159                         return int(new_max)
 160                 if rate < new_min:
 161                         return int(new_min)
 162                 return int(rate)
 163
 164         @staticmethod
 165         def parse_bytes(bytestr):
 166                 """Parse a string indicating a byte quantity into a long integer."""
 167                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 168                 if matchobj is None:
 169                         return None
 170                 number = float(matchobj.group(1))
 171                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 172                 return long(round(number * multiplier))
 173
 174         def set_params(self, params):
 175                 """Sets parameters."""
 176                 if type(params) != dict:
 177                         raise ValueError('params: dictionary expected')
 178                 self._params = params
 179
 180         def get_params(self):
 181                 """Get parameters."""
 182                 return self._params
 183
 184         def add_info_extractor(self, ie):
 185                 """Add an InfoExtractor object to the end of the list."""
 186                 self._ies.append(ie)
 187                 ie.set_downloader(self)
 188
 189         def add_post_processor(self, pp):
 190                 """Add a PostProcessor object to the end of the chain."""
 191                 self._pps.append(pp)
 192                 pp.set_downloader(self)
 193
 194         def to_stdout(self, message, skip_eol=False):
 195                 """Print message to stdout if not in quiet mode."""
 196                 if not self._params.get('quiet', False):
 197                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
 198                         sys.stdout.flush()
 199
 200         def to_stderr(self, message):
 201                 """Print message to stderr."""
 202                 print >>sys.stderr, message
 203
 204         def fixed_template(self):
 205                 """Checks if the output template is fixed."""
 206                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 207
 208         def trouble(self, message=None):
 209                 """Determine action to take when a download problem appears.
 210
 211                 Depending on if the downloader has been configured to ignore
 212                 download errors or not, this method may throw an exception or
 213                 not when errors are found, after printing the message. If it
 214                 doesn't raise, it returns an error code suitable to be returned
 215                 later as a program exit code to indicate error.
 216                 """
 217                 if message is not None:
 218                         self.to_stderr(message)
 219                 if not self._params.get('ignoreerrors', False):
 220                         raise DownloadError(message)
 221                 return 1
 222
 223         def slow_down(self, start_time, byte_counter):
 224                 """Sleep if the download speed is over the rate limit."""
 225                 rate_limit = self._params.get('ratelimit', None)
 226                 if rate_limit is None or byte_counter == 0:
 227                         return
 228                 now = time.time()
 229                 elapsed = now - start_time
 230                 if elapsed <= 0.0:
 231                         return
 232                 speed = float(byte_counter) / elapsed
 233                 if speed > rate_limit:
 234                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 235
 236         def report_destination(self, filename):
 237                 """Report destination filename."""
 238                 self.to_stdout(u'[download] Destination: %s' % filename)
 239
 240         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 241                 """Report download progress."""
 242                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 243                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 244
 245         def report_finish(self):
 246                 """Report download finished."""
 247                 self.to_stdout(u'')
 248
 249         def download(self, url_list):
 250                 """Download a given list of URLs."""
 251                 retcode = 0
 252                 if len(url_list) > 1 and self.fixed_template():
 253                         raise SameFileError(self._params['outtmpl'])
 254
 255                 for url in url_list:
 256                         suitable_found = False
 257                         for ie in self._ies:
 258                                 if not ie.suitable(url):
 259                                         continue
 260                                 # Suitable InfoExtractor found
 261                                 suitable_found = True
 262                                 all_results = ie.extract(url)
 263                                 results = [x for x in all_results if x is not None]
 264                                 if len(results) != len(all_results):
 265                                         retcode = self.trouble()
 266
 267                                 if len(results) > 1 and self.fixed_template():
 268                                         raise SameFileError(self._params['outtmpl'])
 269
 270                                 for result in results:
 271                                         # Forced printings
 272                                         if self._params.get('forcetitle', False):
 273                                                 print result['title']
 274                                         if self._params.get('forceurl', False):
 275                                                 print result['url']
 276
 277                                         # Do nothing else if in simulate mode
 278                                         if self._params.get('simulate', False):
 279                                                 continue
 280
 281                                         try:
 282                                                 filename = self._params['outtmpl'] % result
 283                                                 self.report_destination(filename)
 284                                         except (ValueError, KeyError), err:
 285                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 286                                                 continue
 287                                         try:
 288                                                 self.pmkdir(filename)
 289                                         except (OSError, IOError), err:
 290                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 291                                                 continue
 292                                         try:
 293                                                 outstream = open(filename, 'wb')
 294                                         except (OSError, IOError), err:
 295                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 296                                                 continue
 297                                         try:
 298                                                 self._do_download(outstream, result['url'])
 299                                                 outstream.close()
 300                                         except (OSError, IOError), err:
 301                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 302                                                 continue
 303                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 304                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 305                                                 continue
 306                                         try:
 307                                                 self.post_process(filename, result)
 308                                         except (PostProcessingError), err:
 309                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
 310                                                 continue
 311
 312                                 break
 313                         if not suitable_found:
 314                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 315
 316                 return retcode
 317
 318         def post_process(self, filename, ie_info):
 319                 """Run the postprocessing chain on the given file."""
 320                 info = dict(ie_info)
 321                 info['filepath'] = filename
 322                 for pp in self._pps:
 323                         info = pp.run(info)
 324                         if info is None:
 325                                 break
 326
 327         def _do_download(self, stream, url):
 328                 request = urllib2.Request(url, None, std_headers)
 329                 data = urllib2.urlopen(request)
 330                 data_len = data.info().get('Content-length', None)
 331                 data_len_str = self.format_bytes(data_len)
 332                 byte_counter = 0
 333                 block_size = 1024
 334                 start = time.time()
 335                 while True:
 336                         # Progress message
 337                         percent_str = self.calc_percent(byte_counter, data_len)
 338                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 339                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 340                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 341
 342                         # Download and write
 343                         before = time.time()
 344                         data_block = data.read(block_size)
 345                         after = time.time()
 346                         data_block_len = len(data_block)
 347                         if data_block_len == 0:
 348                                 break
 349                         byte_counter += data_block_len
 350                         stream.write(data_block)
 351                         block_size = self.best_block_size(after - before, data_block_len)
 352
 353                         # Apply rate limit
 354                         self.slow_down(start, byte_counter)
 355
 356                 self.report_finish()
 357                 if data_len is not None and str(byte_counter) != data_len:
 358                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 359
 360 class InfoExtractor(object):
 361         """Information Extractor class.
 362
 363         Information extractors are the classes that, given a URL, extract
 364         information from the video (or videos) the URL refers to. This
 365         information includes the real video URL, the video title and simplified
 366         title, author and others. It is returned in a list of dictionaries when
 367         calling its extract() method. It is a list because a URL can refer to
 368         more than one video (think of playlists). The dictionaries must include
 369         the following fields:
 370
 371         id:             Video identifier.
 372         url:            Final video URL.
 373         uploader:       Nickname of the video uploader.
 374         title:          Literal title.
 375         stitle:         Simplified title.
 376         ext:            Video filename extension.
 377
 378         Subclasses of this one should re-define the _real_initialize() and
 379         _real_extract() methods, as well as the suitable() static method.
 380         Probably, they should also be instantiated and added to the main
 381         downloader.
 382         """
 383
 384         _ready = False
 385         _downloader = None
 386
 387         def __init__(self, downloader=None):
 388                 """Constructor. Receives an optional downloader."""
 389                 self._ready = False
 390                 self.set_downloader(downloader)
 391
 392         @staticmethod
 393         def suitable(url):
 394                 """Receives a URL and returns True if suitable for this IE."""
 395                 return False
 396
 397         def initialize(self):
 398                 """Initializes an instance (authentication, etc)."""
 399                 if not self._ready:
 400                         self._real_initialize()
 401                         self._ready = True
 402
 403         def extract(self, url):
 404                 """Extracts URL information and returns it in list of dicts."""
 405                 self.initialize()
 406                 return self._real_extract(url)
 407
 408         def set_downloader(self, downloader):
 409                 """Sets the downloader for this IE."""
 410                 self._downloader = downloader
 411
 412         def to_stdout(self, message):
 413                 """Print message to stdout if downloader is not in quiet mode."""
 414                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 415                         print message
 416
 417         def to_stderr(self, message):
 418                 """Print message to stderr."""
 419                 print >>sys.stderr, message
 420
 421         def _real_initialize(self):
 422                 """Real initialization process. Redefine in subclasses."""
 423                 pass
 424
 425         def _real_extract(self, url):
 426                 """Real extraction process. Redefine in subclasses."""
 427                 pass
 428
 429 class YoutubeIE(InfoExtractor):
 430         """Information extractor for youtube.com."""
 431
 432         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 433         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 434         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 435         _NETRC_MACHINE = 'youtube'
 436
 437         @staticmethod
 438         def suitable(url):
 439                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 440
 441         def report_login(self):
 442                 """Report attempt to log in."""
 443                 self.to_stdout(u'[youtube] Logging in')
 444
 445         def report_age_confirmation(self):
 446                 """Report attempt to confirm age."""
 447                 self.to_stdout(u'[youtube] Confirming age')
 448
 449         def report_webpage_download(self, video_id):
 450                 """Report attempt to download webpage."""
 451                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 452
 453         def report_information_extraction(self, video_id):
 454                 """Report attempt to extract video information."""
 455                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 456
 457         def report_video_url(self, video_id, video_real_url):
 458                 """Report extracted video URL."""
 459                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 460
 461         def _real_initialize(self):
 462                 if self._downloader is None:
 463                         return
 464
 465                 username = None
 466                 password = None
 467                 downloader_params = self._downloader.get_params()
 468
 469                 # Attempt to use provided username and password or .netrc data
 470                 if downloader_params.get('username', None) is not None:
 471                         username = downloader_params['username']
 472                         password = downloader_params['password']
 473                 elif downloader_params.get('usenetrc', False):
 474                         try:
 475                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 476                                 if info is not None:
 477                                         username = info[0]
 478                                         password = info[2]
 479                                 else:
 480                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 481                         except (IOError, netrc.NetrcParseError), err:
 482                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 483                                 return
 484
 485                 # No authentication to be performed
 486                 if username is None:
 487                         return
 488
 489                 # Log in
 490                 login_form = {
 491                                 'current_form': 'loginForm',
 492                                 'next':         '/',
 493                                 'action_login': 'Log In',
 494                                 'username':     username,
 495                                 'password':     password,
 496                                 }
 497                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 498                 try:
 499                         self.report_login()
 500                         login_results = urllib2.urlopen(request).read()
 501                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 502                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
 503                                 return
 504                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 505                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 506                         return
 507
 508                 # Confirm age
 509                 age_form = {
 510                                 'next_url':             '/',
 511                                 'action_confirm':       'Confirm',
 512                                 }
 513                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 514                 try:
 515                         self.report_age_confirmation()
 516                         age_results = urllib2.urlopen(request).read()
 517                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 518                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 519                         return
 520
 521         def _real_extract(self, url):
 522                 # Extract video id from URL
 523                 mobj = re.match(self._VALID_URL, url)
 524                 if mobj is None:
 525                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 526                         return [None]
 527                 video_id = mobj.group(2)
 528
 529                 # Downloader parameters
 530                 format_param = None
 531                 if self._downloader is not None:
 532                         params = self._downloader.get_params()
 533                         format_param = params.get('format', None)
 534
 535                 # Extension
 536                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
 537
 538                 # Normalize URL, including format
 539                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 540                 if format_param is not None:
 541                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 542                 request = urllib2.Request(normalized_url, None, std_headers)
 543                 try:
 544                         self.report_webpage_download(video_id)
 545                         video_webpage = urllib2.urlopen(request).read()
 546                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 547                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
 548                         return [None]
 549                 self.report_information_extraction(video_id)
 550
 551                 # "t" param
 552                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 553                 if mobj is None:
 554                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
 555                         return [None]
 556                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 557                 if format_param is not None:
 558                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 559                 self.report_video_url(video_id, video_real_url)
 560
 561                 # uploader
 562                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 563                 if mobj is None:
 564                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 565                         return [None]
 566                 video_uploader = mobj.group(1)
 567
 568                 # title
 569                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 570                 if mobj is None:
 571                         self.to_stderr(u'ERROR: unable to extract video title')
 572                         return [None]
 573                 video_title = mobj.group(1).decode('utf-8')
 574                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 575                 video_title = video_title.replace(os.sep, u'%')
 576
 577                 # simplified title
 578                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 579                 simple_title = simple_title.strip(ur'_')
 580
 581                 # Return information
 582                 return [{
 583                         'id':           video_id.decode('utf-8'),
 584                         'url':          video_real_url.decode('utf-8'),
 585                         'uploader':     video_uploader.decode('utf-8'),
 586                         'title':        video_title,
 587                         'stitle':       simple_title,
 588                         'ext':          video_extension.decode('utf-8'),
 589                         }]
 590
 591 class MetacafeIE(InfoExtractor):
 592         """Information Extractor for metacafe.com."""
 593
 594         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 595         _DISCLAIMER = 'http://www.metacafe.com/disclaimer'
 596         _youtube_ie = None
 597
 598         def __init__(self, youtube_ie, downloader=None):
 599                 InfoExtractor.__init__(self, downloader)
 600                 self._youtube_ie = youtube_ie
 601
 602         @staticmethod
 603         def suitable(url):
 604                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 605
 606         def report_disclaimer(self):
 607                 """Report disclaimer retrieval."""
 608                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
 609
 610         def report_age_confirmation(self):
 611                 """Report attempt to confirm age."""
 612                 self.to_stdout(u'[metacafe] Confirming age')
 613
 614         def report_download_webpage(self, video_id):
 615                 """Report webpage download."""
 616                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 617
 618         def report_extraction(self, video_id):
 619                 """Report information extraction."""
 620                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 621
 622         def _real_initialize(self):
 623                 # Retrieve disclaimer
 624                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 625                 try:
 626                         self.report_disclaimer()
 627                         disclaimer = urllib2.urlopen(request).read()
 628                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 629                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 630                         return
 631
 632                 # Confirm age
 633                 disclaimer_form = {
 634                         'allowAdultContent': '1',
 635                         'submit': "Continue - I'm over 18",
 636                         }
 637                 request = urllib2.Request('http://www.metacafe.com/watch/', urllib.urlencode(disclaimer_form), std_headers)
 638                 try:
 639                         self.report_age_confirmation()
 640                         disclaimer = urllib2.urlopen(request).read()
 641                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 642                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 643                         return
 644
 645         def _real_extract(self, url):
 646                 # Extract id and simplified title from URL
 647                 mobj = re.match(self._VALID_URL, url)
 648                 if mobj is None:
 649                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 650                         return [None]
 651
 652                 video_id = mobj.group(1)
 653
 654                 # Check if video comes from YouTube
 655                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 656                 if mobj2 is not None:
 657                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 658
 659                 simple_title = mobj.group(2).decode('utf-8')
 660                 video_extension = 'flv'
 661
 662                 # Retrieve video webpage to extract further information
 663                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 664                 try:
 665                         self.report_download_webpage(video_id)
 666                         webpage = urllib2.urlopen(request).read()
 667                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 668                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
 669                         return [None]
 670
 671                 # Extract URL, uploader and title from webpage
 672                 self.report_extraction(video_id)
 673                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
 674                 if mobj is None:
 675                         self.to_stderr(u'ERROR: unable to extract media URL')
 676                         return [None]
 677                 mediaURL = mobj.group(1).replace('\\', '')
 678
 679                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
 680                 if mobj is None:
 681                         self.to_stderr(u'ERROR: unable to extract gdaKey')
 682                         return [None]
 683                 gdaKey = mobj.group(1)
 684
 685                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 686
 687                 mobj = re.search(r'(?im)<meta name="title" content="Metacafe - ([^"]+)"', webpage)
 688                 if mobj is None:
 689                         self.to_stderr(u'ERROR: unable to extract title')
 690                         return [None]
 691                 video_title = mobj.group(1).decode('utf-8')
 692
 693                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
 694                 if mobj is None:
 695                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 696                         return [None]
 697                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
 698
 699                 # Return information
 700                 return [{
 701                         'id':           video_id.decode('utf-8'),
 702                         'url':          video_url.decode('utf-8'),
 703                         'uploader':     video_uploader.decode('utf-8'),
 704                         'title':        video_title,
 705                         'stitle':       simple_title,
 706                         'ext':          video_extension.decode('utf-8'),
 707                         }]
 708
 709 class YoutubePlaylistIE(InfoExtractor):
 710         """Information Extractor for YouTube playlists."""
 711
 712         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 713         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s'
 714         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 715         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 716         _youtube_ie = None
 717
 718         def __init__(self, youtube_ie, downloader=None):
 719                 InfoExtractor.__init__(self, downloader)
 720                 self._youtube_ie = youtube_ie
 721
 722         @staticmethod
 723         def suitable(url):
 724                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 725
 726         def report_download_page(self, playlist_id, pagenum):
 727                 """Report attempt to download playlist page with given number."""
 728                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 729
 730         def _real_initialize(self):
 731                 self._youtube_ie.initialize()
 732
 733         def _real_extract(self, url):
 734                 # Extract playlist id
 735                 mobj = re.match(self._VALID_URL, url)
 736                 if mobj is None:
 737                         self.to_stderr(u'ERROR: invalid url: %s' % url)
 738                         return [None]
 739
 740                 # Download playlist pages
 741                 playlist_id = mobj.group(1)
 742                 video_ids = []
 743                 pagenum = 1
 744
 745                 while True:
 746                         self.report_download_page(playlist_id, pagenum)
 747                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 748                         try:
 749                                 page = urllib2.urlopen(request).read()
 750                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 751                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
 752                                 return [None]
 753
 754                         # Extract video identifiers
 755                         ids_in_page = set()
 756                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 757                                 ids_in_page.add(mobj.group(1))
 758                         video_ids.extend(list(ids_in_page))
 759
 760                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 761                                 break
 762                         pagenum = pagenum + 1
 763
 764                 information = []
 765                 for id in video_ids:
 766                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 767                 return information
 768
 769 class PostProcessor(object):
 770         """Post Processor class.
 771
 772         PostProcessor objects can be added to downloaders with their
 773         add_post_processor() method. When the downloader has finished a
 774         successful download, it will take its internal chain of PostProcessors
 775         and start calling the run() method on each one of them, first with
 776         an initial argument and then with the returned value of the previous
 777         PostProcessor.
 778
 779         The chain will be stopped if one of them ever returns None or the end
 780         of the chain is reached.
 781
 782         PostProcessor objects follow a "mutual registration" process similar
 783         to InfoExtractor objects.
 784         """
 785
 786         _downloader = None
 787
 788         def __init__(self, downloader=None):
 789                 self._downloader = downloader
 790
 791         def to_stdout(self, message):
 792                 """Print message to stdout if downloader is not in quiet mode."""
 793                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 794                         print message
 795
 796         def to_stderr(self, message):
 797                 """Print message to stderr."""
 798                 print >>sys.stderr, message
 799
 800         def set_downloader(self, downloader):
 801                 """Sets the downloader for this PP."""
 802                 self._downloader = downloader
 803
 804         def run(self, information):
 805                 """Run the PostProcessor.
 806
 807                 The "information" argument is a dictionary like the ones
 808                 returned by InfoExtractors. The only difference is that this
 809                 one has an extra field called "filepath" that points to the
 810                 downloaded file.
 811
 812                 When this method returns None, the postprocessing chain is
 813                 stopped. However, this method may return an information
 814                 dictionary that will be passed to the next postprocessing
 815                 object in the chain. It can be the one it received after
 816                 changing some fields.
 817
 818                 In addition, this method may raise a PostProcessingError
 819                 exception that will be taken into account by the downloader
 820                 it was called from.
 821                 """
 822                 return information # by default, do nothing
 823
 824 ### MAIN PROGRAM ###
 825 if __name__ == '__main__':
 826         try:
 827                 # Modules needed only when running the main program
 828                 import getpass
 829                 import optparse
 830
 831                 # General configuration
 832                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 833                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 834                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 835
 836                 # Parse command line
 837                 parser = optparse.OptionParser(
 838                                 usage='Usage: %prog [options] url...',
 839                                 version='2008.08.09',
 840                                 conflict_handler='resolve',
 841                                 )
 842                 parser.add_option('-h', '--help',
 843                                 action='help', help='print this help text and exit')
 844                 parser.add_option('-v', '--version',
 845                                 action='version', help='print program version and exit')
 846                 parser.add_option('-u', '--username',
 847                                 dest='username', metavar='UN', help='account username')
 848                 parser.add_option('-p', '--password',
 849                                 dest='password', metavar='PW', help='account password')
 850                 parser.add_option('-o', '--output',
 851                                 dest='outtmpl', metavar='TPL', help='output filename template')
 852                 parser.add_option('-q', '--quiet',
 853                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 854                 parser.add_option('-s', '--simulate',
 855                                 action='store_true', dest='simulate', help='do not download video', default=False)
 856                 parser.add_option('-t', '--title',
 857                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 858                 parser.add_option('-l', '--literal',
 859                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 860                 parser.add_option('-n', '--netrc',
 861                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 862                 parser.add_option('-g', '--get-url',
 863                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 864                 parser.add_option('-e', '--get-title',
 865                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 866                 parser.add_option('-f', '--format',
 867                                 dest='format', metavar='FMT', help='video format code')
 868                 parser.add_option('-b', '--best-quality',
 869                                 action='store_const', dest='format', help='alias for -f 18', const='18')
 870                 parser.add_option('-m', '--mobile-version',
 871                                 action='store_const', dest='format', help='alias for -f 17', const='17')
 872                 parser.add_option('-i', '--ignore-errors',
 873                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 874                 parser.add_option('-r', '--rate-limit',
 875                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
 876                 (opts, args) = parser.parse_args()
 877
 878                 # Conflicting, missing and erroneous options
 879                 if len(args) < 1:
 880                         sys.exit(u'ERROR: you must provide at least one URL')
 881                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 882                         sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
 883                 if opts.password is not None and opts.username is None:
 884                         sys.exit(u'ERROR: account username missing')
 885                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 886                         sys.exit(u'ERROR: using output template conflicts with using title or literal title')
 887                 if opts.usetitle and opts.useliteral:
 888                         sys.exit(u'ERROR: using title conflicts with using literal title')
 889                 if opts.username is not None and opts.password is None:
 890                         opts.password = getpass.getpass(u'Type account password and press return:')
 891                 if opts.ratelimit is not None:
 892                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
 893                         if numeric_limit is None:
 894                                 sys.exit(u'ERROR: invalid rate limit specified')
 895                         opts.ratelimit = numeric_limit
 896
 897                 # Information extractors
 898                 youtube_ie = YoutubeIE()
 899                 metacafe_ie = MetacafeIE(youtube_ie)
 900                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
 901
 902                 # File downloader
 903                 fd = FileDownloader({
 904                         'usenetrc': opts.usenetrc,
 905                         'username': opts.username,
 906                         'password': opts.password,
 907                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 908                         'forceurl': opts.geturl,
 909                         'forcetitle': opts.gettitle,
 910                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 911                         'format': opts.format,
 912                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode())
 913                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
 914                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
 915                                 or u'%(id)s.%(ext)s'),
 916                         'ignoreerrors': opts.ignoreerrors,
 917                         'ratelimit': opts.ratelimit,
 918                         })
 919                 fd.add_info_extractor(youtube_pl_ie)
 920                 fd.add_info_extractor(metacafe_ie)
 921                 fd.add_info_extractor(youtube_ie)
 922                 retcode = fd.download(args)
 923                 sys.exit(retcode)
 924
 925         except DownloadError:
 926                 sys.exit(1)
 927         except SameFileError:
 928                 sys.exit(u'ERROR: fixed output name but more than one file to download')
 929         except KeyboardInterrupt:
 930                 sys.exit(u'\nERROR: Interrupted by user')