#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import sys
import time
import urllib
import urllib2

std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        pass

class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass

class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        pass

class FileDownloader(object):
        """File Downloader class.

        File downloader objects are responsible for downloading the actual
        video file and writing it to disk if the user has requested it, among
        other tasks. In most cases there should be one per program. Since,
        given a video URL, the downloader doesn't know how to extract all the
        needed information (a task that InfoExtractors do), it has to pass the
        URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge of the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        """
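
        # A rough usage sketch (illustrative, not part of the program flow):
        # the __main__ section at the bottom of this file builds the real
        # options dictionary from command-line flags; the values below are
        # made up to show the shape of the params dict documented above.
        #
        #       fd = FileDownloader({
        #               'usenetrc': False,
        #               'username': None,
        #               'password': None,
        #               'quiet': False,
        #               'forceurl': False,
        #               'forcetitle': False,
        #               'simulate': False,
        #               'format': '18',
        #               'outtmpl': u'%(stitle)s-%(id)s.%(ext)s',
        #               'ignoreerrors': False,
        #               'ratelimit': None,
        #               'nooverwrites': True,
        #               })
        #       fd.add_info_extractor(YoutubeIE())
        #       fd.download(['http://www.youtube.com/watch?v=VIDEO_ID'])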

        params = None
        _ies = []
        _pps = []

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                if bytes is None:
                        return 'N/A'
                if bytes == 0:
                        exponent = 0
                else:
                        exponent = long(math.log(float(bytes), 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

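        # Worked example of best_block_size() above (numbers follow the code):
        # if the previous 1024-byte block took 0.5 s, the measured rate is
        # 1024 / 0.5 = 2048 B/s, which falls between new_min (512) and
        # new_max (2048), so the next read uses a 2048-byte block.
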
        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

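        # Worked examples for the two byte helpers (values follow the code):
        #
        #       FileDownloader.parse_bytes('50k')    -> 51200L
        #       FileDownloader.parse_bytes('44.6m')  -> 46766490L
        #       FileDownloader.format_bytes(51200)   -> '50.00k'
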
        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self.params.get('quiet', False):
                        print u'%s%s' % (message, [u'\n', u''][skip_eol]),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def fixed_template(self):
                """Check whether the output template is fixed (contains no substitution fields)."""
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine the action to take when a download problem appears.

                Depending on whether the downloader has been configured to
                ignore download errors or not, this method may raise an
                exception after printing the message, or return an error code
                suitable to be returned later as a program exit code.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                return 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

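        # Worked example of slow_down() above (illustrative numbers): with
        # ratelimit set to 51200 B/s, if 102400 bytes arrived during the first
        # second the measured speed is 102400 B/s, so the method sleeps
        # (102400 - 51200 * 1) / 51200 = 1 second, pulling the average speed
        # back down to the configured limit.
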
        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_finish(self):
                """Report download finished."""
                self.to_stdout(u'')

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Forced printings
                if self.params.get('forcetitle', False):
                        print info_dict['title']
                if self.params.get('forceurl', False):
                        print info_dict['url']

                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        return 0

                try:
                        filename = self.params['outtmpl'] % info_dict
                        self.report_destination(filename)
                except (ValueError, KeyError), err:
                        return self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                if self.params['nooverwrites'] and os.path.exists(filename):
                        self.to_stderr('WARNING: file exists: %s; skipping' % filename)
                        return 0
                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        return self.trouble('ERROR: unable to create directories: %s' % str(err))
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError), err:
                        return self.trouble('ERROR: unable to open for writing: %s' % str(err))
                try:
                        self._do_download(outstream, info_dict['url'])
                        outstream.close()
                except (OSError, IOError), err:
                        return self.trouble('ERROR: unable to write video data: %s' % str(err))
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        return self.trouble('ERROR: unable to download video data: %s' % str(err))
                try:
                        self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                        return self.trouble('ERROR: postprocessing: %s' % str(err))

                return 0

        def download(self, url_list):
                """Download a given list of URLs."""
                retcode = 0
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL
                                all_results = ie.extract(url)
                                results = [x for x in all_results if x is not None]

                                # See if there were problems extracting any information
                                if len(results) != len(all_results):
                                        retcode = self.trouble()

                                # Two results could go to the same file
                                if len(results) > 1 and self.fixed_template():
                                        raise SameFileError(self.params['outtmpl'])

                                # Process each result
                                for result in results:
                                        result = self.process_info(result)

                                        # Do not overwrite an error code with a success code
                                        if result != 0:
                                                retcode = result

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        if info is None:
                                break

        def _do_download(self, stream, url):
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len
                        stream.write(data_block)
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                if data_len is not None and str(byte_counter) != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))

class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and
        simplified title, the uploader nickname and others. It is returned in
        a list of dictionaries when calling its extract() method. It is a list
        because a URL can refer to more than one video (think of playlists).
        The dictionaries must include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        They should probably also be instantiated and added to the main
        downloader.
        """
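
        # Hypothetical minimal subclass, for illustration only (the extractors
        # below are the real reference); the class name and URL are made up:
        #
        #       class ExampleIE(InfoExtractor):
        #               @staticmethod
        #               def suitable(url):
        #                       return url.startswith('http://video.example.com/')
        #               def _real_extract(self, url):
        #                       return [{'id': u'42', 'url': url.decode('utf-8'),
        #                               'uploader': u'someone', 'title': u'Some title',
        #                               'stitle': u'Some_title', 'ext': u'flv'}]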

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout if downloader is not in quiet mode."""
                if self._downloader is None or not self._downloader.params.get('quiet', False):
                        print message

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass

class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        _NETRC_MACHINE = 'youtube'

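        # For reference, _VALID_URL above accepts forms like these
        # (VIDEO_ID is a placeholder for a real identifier):
        #
        #       http://www.youtube.com/watch?v=VIDEO_ID
        #       http://youtube.com/v/VIDEO_ID
        #       VIDEO_ID                (a bare video identifier)
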
        @staticmethod
        def suitable(url):
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        @staticmethod
        def htmlentity_transform(matchobj):
                """Transforms an HTML entity to a Unicode character."""
                entity = matchobj.group(1)

                # Known non-numeric HTML entity
                if entity in htmlentitydefs.name2codepoint:
                        return unichr(htmlentitydefs.name2codepoint[entity])

                # Unicode character
                mobj = re.match(ur'(?u)#(x?\d+)', entity)
                if mobj is not None:
                        numstr = mobj.group(1)
                        if numstr.startswith(u'x'):
                                base = 16
                                numstr = u'0%s' % numstr
                        else:
                                base = 10
                        return unichr(long(numstr, base))

                # Unknown entity in name, return its literal representation
                return (u'&%s;' % entity)

        def report_lang(self):
                """Report attempt to set language."""
                self.to_stdout(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self.to_stdout(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[youtube] Confirming age')

        def report_webpage_download(self, video_id):
                """Report attempt to download webpage."""
                self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

        def report_video_url(self, video_id, video_real_url):
                """Report extracted video URL."""
                self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

        def _real_initialize(self):
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL, None, std_headers)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.params
                        format_param = params.get('format', None)
                if format_param is None:
                        format_param = '34'

                # Extension
                video_extension = {
                        '17': '3gp',
                        '18': 'mp4',
                        '22': 'mp4',
                }.get(format_param, 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.report_webpage_download(video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
                        return [None]
                self.report_information_extraction(video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.report_video_url(video_id, video_real_url)

                # uploader
                mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
                video_title = video_title.replace(os.sep, u'%')

                # simplified title
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # Return information
                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_real_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension.decode('utf-8'),
                        }]

class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com."""

        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(MetacafeIE._VALID_URL, url) is not None)

        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self.to_stdout(u'[metacafe] Retrieving disclaimer')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[metacafe] Confirming age')

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

        def _real_initialize(self):
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]

                video_id = mobj.group(1)

                # Check if video comes from YouTube
                mobj2 = re.match(r'^yt-(.*)$', video_id)
                if mobj2 is not None:
                        return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

                simple_title = mobj.group(2).decode('utf-8')
                video_extension = 'flv'

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve video webpage: %s' % str(err))
                        return [None]

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract media URL')
                        return [None]
                mediaURL = mobj.group(1).replace('\\', '')

                mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract gdaKey')
                        return [None]
                gdaKey = mobj.group(1)

                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')

                mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract uploader nickname')
                        return [None]
                video_uploader = re.sub(r'<.*?>', '', mobj.group(1))

                # Return information
                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension.decode('utf-8'),
                        }]


class YoutubeSearchIE(InfoExtractor):
        """Information Extractor for YouTube search queries."""
        _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
        _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
        _MORE_PAGES_INDICATOR = r'>Next</a>'
        _youtube_ie = None
        _max_youtube_results = 1000

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search results page with given number."""
                self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

        def _real_initialize(self):
                self._youtube_ie.initialize()

        def _real_extract(self, query):
                mobj = re.match(self._VALID_QUERY, query)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid search query "%s"' % query)
                        return [None]

                prefix, query = query.split(':')
                prefix = prefix[8:]
                if prefix == '':
                        return self._download_n_results(query, 1)
                elif prefix == 'all':
                        return self._download_n_results(query, self._max_youtube_results)
                else:
                        try:
                                n = int(prefix)
                                if n <= 0:
                                        self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                        return [None]
                                elif n > self._max_youtube_results:
                                        self.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                                        n = self._max_youtube_results
                                return self._download_n_results(query, n)
                        except ValueError: # parsing prefix as int fails
                                return self._download_n_results(query, 1)

        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                already_seen = set()
                pagenum = 1

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
                        request = urllib2.Request(result_url, None, std_headers)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
                                return [None]

                        # Extract video identifiers
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                                if video_id not in already_seen:
                                        video_ids.append(video_id)
                                        already_seen.add(video_id)
                                        if len(video_ids) == n:
                                                # Specified n videos reached
                                                information = []
                                                for id in video_ids:
                                                        information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
                                                return information

                        if self._MORE_PAGES_INDICATOR not in page:
                                information = []
                                for id in video_ids:
                                        information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
                                return information

                        pagenum = pagenum + 1

class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists."""

        _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
        _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
        _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

        def report_download_page(self, playlist_id, pagenum):
                """Report attempt to download playlist page with given number."""
                self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

        def _real_initialize(self):
                self._youtube_ie.initialize()

        def _real_extract(self, url):
                # Extract playlist id
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]

                # Download playlist pages
                playlist_id = mobj.group(1)
                video_ids = []
                pagenum = 1

                while True:
                        self.report_download_page(playlist_id, pagenum)
                        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
                                return [None]

                        # Extract video identifiers
                        ids_in_page = []
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(mobj.group(1))
                        video_ids.extend(ids_in_page)

                        if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
                                break
                        pagenum = pagenum + 1

                information = []
                for id in video_ids:
                        information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
                return information

class PostProcessor(object):
        """Post Processor class.

        PostProcessor objects can be added to downloaders with their
        add_post_processor() method. When the downloader has finished a
        successful download, it will take its internal chain of PostProcessors
        and start calling the run() method on each one of them, first with
        an initial argument and then with the returned value of the previous
        PostProcessor.

        The chain will be stopped if one of them ever returns None or the end
        of the chain is reached.

        PostProcessor objects follow a "mutual registration" process similar
        to InfoExtractor objects.
        """
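
        # Hypothetical subclass sketch (illustration only): it reads the extra
        # "filepath" field and passes the information dict along unchanged.
        #
        #       class PrintFilePP(PostProcessor):
        #               def run(self, information):
        #                       self.to_stdout(u'[postprocess] saved %s' % information['filepath'])
        #                       return information
        #
        # It would be attached with fd.add_post_processor(PrintFilePP()).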

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout if downloader is not in quiet mode."""
                if self._downloader is None or not self._downloader.params.get('quiet', False):
                        print message

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def set_downloader(self, downloader):
                """Sets the downloader for this PP."""
                self._downloader = downloader

        def run(self, information):
                """Run the PostProcessor.

                The "information" argument is a dictionary like the ones
                returned by InfoExtractors. The only difference is that this
                one has an extra field called "filepath" that points to the
                downloaded file.

                When this method returns None, the postprocessing chain is
                stopped. However, this method may return an information
                dictionary that will be passed to the next postprocessing
                object in the chain. It can be the one it received after
                changing some fields.

                In addition, this method may raise a PostProcessingError
                exception that will be taken into account by the downloader
                it was called from.
                """
                return information # by default, do nothing

### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
                urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                                usage='Usage: %prog [options] url...',
                                version='INTERNAL',
                                conflict_handler='resolve',
                                )
                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                parser.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                parser.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                parser.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                parser.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                parser.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                parser.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                parser.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                parser.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option('-f', '--format',
                                dest='format', metavar='FMT', help='video format code')
                parser.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                parser.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
                parser.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                parser.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                (opts, args) = parser.parse_args()

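                # Illustrative invocations (URLs and values are placeholders):
                #
                #       youtube-dl -t 'http://www.youtube.com/watch?v=VIDEO_ID'
                #       youtube-dl -f 22 -o '%(stitle)s.%(ext)s' 'http://www.youtube.com/watch?v=VIDEO_ID'
                #       youtube-dl -r 50k -a batch_urls.txt
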
                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        sys.exit(u'ERROR: you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        sys.exit(u'ERROR: account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        sys.exit(u'ERROR: using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                sys.exit(u'ERROR: invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                charset = locale.getdefaultlocale()[1]
                if charset is None:
                        charset = 'ascii'
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')