git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
  21 std_headers = {
  22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
  23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  25         'Accept-Language': 'en-us,en;q=0.5',
  26 }
  27
  28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class UnavailableFormatError(Exception):
  56         """Unavailable Format exception.
  57
  58         This exception will be thrown when a video is requested
  59         in a format that is not available for that video.
  60         """
  61
  62 class FileDownloader(object):
  63         """File Downloader class.
  64
  65         File downloader objects are the ones responsible of downloading the
  66         actual video file and writing it to disk if the user has requested
  67         it, among some other tasks. In most cases there should be one per
  68         program. As, given a video URL, the downloader doesn't know how to
  69         extract all the needed information, task that InfoExtractors do, it
  70         has to pass the URL to one of them.
  71
  72         For this, file downloader objects have a method that allows
  73         InfoExtractors to be registered in a given order. When it is passed
  74         a URL, the file downloader handles it to the first InfoExtractor it
  75         finds that reports being able to handle it. The InfoExtractor extracts
  76         all the information about the video or videos the URL refers to, and
  77         asks the FileDownloader to process the video information, possibly
  78         downloading the video.
  79
  80         File downloaders accept a lot of parameters. In order not to saturate
  81         the object constructor with arguments, it receives a dictionary of
  82         options instead. These options are available through the params
  83         attribute for the InfoExtractors to use. The FileDownloader also
  84         registers itself as the downloader in charge for the InfoExtractors
  85         that are added to it, so this is a "mutual registration".
  86
  87         Available options:
  88
  89         username:       Username for authentication purposes.
  90         password:       Password for authentication purposes.
  91         usenetrc:       Use netrc for authentication instead.
  92         quiet:          Do not print messages to stdout.
  93         forceurl:       Force printing final URL.
  94         forcetitle:     Force printing title.
  95         simulate:       Do not download the video files.
  96         format:         Video format code.
  97         outtmpl:        Template for output names.
  98         ignoreerrors:   Do not stop on download errors.
  99         ratelimit:      Download speed limit, in bytes/sec.
 100         nooverwrites:   Prevent overwriting files.
 101         """
 102
 103         params = None
 104         _ies = []
 105         _pps = []
 106         _download_retcode = None
 107
 108         def __init__(self, params):
 109                 """Create a FileDownloader object with the given options."""
 110                 self._ies = []
 111                 self._pps = []
 112                 self._download_retcode = 0
 113                 self.params = params
 114
 115         @staticmethod
 116         def pmkdir(filename):
 117                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 118                 components = filename.split(os.sep)
 119                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 120                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 121                 for dir in aggregate:
 122                         if not os.path.exists(dir):
 123                                 os.mkdir(dir)
 124
 125         @staticmethod
 126         def format_bytes(bytes):
 127                 if bytes is None:
 128                         return 'N/A'
 129                 if bytes == 0:
 130                         exponent = 0
 131                 else:
 132                         exponent = long(math.log(float(bytes), 1024.0))
 133                 suffix = 'bkMGTPEZY'[exponent]
 134                 converted = float(bytes) / float(1024**exponent)
 135                 return '%.2f%s' % (converted, suffix)
 136
 137         @staticmethod
 138         def calc_percent(byte_counter, data_len):
 139                 if data_len is None:
 140                         return '---.-%'
 141                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 142
 143         @staticmethod
 144         def calc_eta(start, now, total, current):
 145                 if total is None:
 146                         return '--:--'
 147                 dif = now - start
 148                 if current == 0 or dif < 0.001: # One millisecond
 149                         return '--:--'
 150                 rate = float(current) / dif
 151                 eta = long((float(total) - float(current)) / rate)
 152                 (eta_mins, eta_secs) = divmod(eta, 60)
 153                 if eta_mins > 99:
 154                         return '--:--'
 155                 return '%02d:%02d' % (eta_mins, eta_secs)
 156
 157         @staticmethod
 158         def calc_speed(start, now, bytes):
 159                 dif = now - start
 160                 if bytes == 0 or dif < 0.001: # One millisecond
 161                         return '%10s' % '---b/s'
 162                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 163
 164         @staticmethod
 165         def best_block_size(elapsed_time, bytes):
 166                 new_min = max(bytes / 2.0, 1.0)
 167                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 168                 if elapsed_time < 0.001:
 169                         return int(new_max)
 170                 rate = bytes / elapsed_time
 171                 if rate > new_max:
 172                         return int(new_max)
 173                 if rate < new_min:
 174                         return int(new_min)
 175                 return int(rate)
 176
 177         @staticmethod
 178         def parse_bytes(bytestr):
 179                 """Parse a string indicating a byte quantity into a long integer."""
 180                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 181                 if matchobj is None:
 182                         return None
 183                 number = float(matchobj.group(1))
 184                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 185                 return long(round(number * multiplier))
 186
 187         def add_info_extractor(self, ie):
 188                 """Add an InfoExtractor object to the end of the list."""
 189                 self._ies.append(ie)
 190                 ie.set_downloader(self)
 191
 192         def add_post_processor(self, pp):
 193                 """Add a PostProcessor object to the end of the chain."""
 194                 self._pps.append(pp)
 195                 pp.set_downloader(self)
 196
 197         def to_stdout(self, message, skip_eol=False):
 198                 """Print message to stdout if not in quiet mode."""
 199                 if not self.params.get('quiet', False):
 200                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
 201                         sys.stdout.flush()
 202
 203         def to_stderr(self, message):
 204                 """Print message to stderr."""
 205                 print >>sys.stderr, message
 206
 207         def fixed_template(self):
 208                 """Checks if the output template is fixed."""
 209                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 210
 211         def trouble(self, message=None):
 212                 """Determine action to take when a download problem appears.
 213
 214                 Depending on if the downloader has been configured to ignore
 215                 download errors or not, this method may throw an exception or
 216                 not when errors are found, after printing the message.
 217                 """
 218                 if message is not None:
 219                         self.to_stderr(message)
 220                 if not self.params.get('ignoreerrors', False):
 221                         raise DownloadError(message)
 222                 self._download_retcode = 1
 223
 224         def slow_down(self, start_time, byte_counter):
 225                 """Sleep if the download speed is over the rate limit."""
 226                 rate_limit = self.params.get('ratelimit', None)
 227                 if rate_limit is None or byte_counter == 0:
 228                         return
 229                 now = time.time()
 230                 elapsed = now - start_time
 231                 if elapsed <= 0.0:
 232                         return
 233                 speed = float(byte_counter) / elapsed
 234                 if speed > rate_limit:
 235                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 236
 237         def report_destination(self, filename):
 238                 """Report destination filename."""
 239                 self.to_stdout(u'[download] Destination: %s' % filename)
 240
 241         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 242                 """Report download progress."""
 243                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 244                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 245
 246         def report_finish(self):
 247                 """Report download finished."""
 248                 self.to_stdout(u'')
 249
 250         def process_info(self, info_dict):
 251                 """Process a single dictionary returned by an InfoExtractor."""
 252                 # Forced printings
 253                 if self.params.get('forcetitle', False):
 254                         print info_dict['title'].encode(locale.getpreferredencoding())
 255                 if self.params.get('forceurl', False):
 256                         print info_dict['url'].encode(locale.getpreferredencoding())
 257
 258                 # Do nothing else if in simulate mode
 259                 if self.params.get('simulate', False):
 260                         return
 261
 262                 try:
 263                         filename = self.params['outtmpl'] % info_dict
 264                         self.report_destination(filename)
 265                 except (ValueError, KeyError), err:
 266                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 267                 if self.params['nooverwrites'] and os.path.exists(filename):
 268                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 269                         return
 270
 271                 try:
 272                         self.pmkdir(filename)
 273                 except (OSError, IOError), err:
 274                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 275                         return
 276
 277                 try:
 278                         outstream = open(filename, 'wb')
 279                 except (OSError, IOError), err:
 280                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 281                         return
 282
 283                 try:
 284                         self._do_download(outstream, info_dict['url'])
 285                         outstream.close()
 286                 except (OSError, IOError), err:
 287                         os.remove(filename)
 288                         raise UnavailableFormatError
 289                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 290                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 291                         return
 292
 293                 try:
 294                         self.post_process(filename, info_dict)
 295                 except (PostProcessingError), err:
 296                         self.trouble('ERROR: postprocessing: %s' % str(err))
 297                         return
 298
 299         def download(self, url_list):
 300                 """Download a given list of URLs."""
 301                 if len(url_list) > 1 and self.fixed_template():
 302                         raise SameFileError(self.params['outtmpl'])
 303
 304                 for url in url_list:
 305                         suitable_found = False
 306                         for ie in self._ies:
 307                                 # Go to next InfoExtractor if not suitable
 308                                 if not ie.suitable(url):
 309                                         continue
 310
 311                                 # Suitable InfoExtractor found
 312                                 suitable_found = True
 313
 314                                 # Extract information from URL and process it
 315                                 ie.extract(url)
 316
 317                                 # Suitable InfoExtractor had been found; go to next URL
 318                                 break
 319
 320                         if not suitable_found:
 321                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 322
 323                 return self._download_retcode
 324
 325         def post_process(self, filename, ie_info):
 326                 """Run the postprocessing chain on the given file."""
 327                 info = dict(ie_info)
 328                 info['filepath'] = filename
 329                 for pp in self._pps:
 330                         info = pp.run(info)
 331                         if info is None:
 332                                 break
 333
 334         def _do_download(self, stream, url):
 335                 request = urllib2.Request(url, None, std_headers)
 336                 data = urllib2.urlopen(request)
 337                 data_len = data.info().get('Content-length', None)
 338                 data_len_str = self.format_bytes(data_len)
 339                 byte_counter = 0
 340                 block_size = 1024
 341                 start = time.time()
 342                 while True:
 343                         # Progress message
 344                         percent_str = self.calc_percent(byte_counter, data_len)
 345                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 346                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 347                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 348
 349                         # Download and write
 350                         before = time.time()
 351                         data_block = data.read(block_size)
 352                         after = time.time()
 353                         data_block_len = len(data_block)
 354                         if data_block_len == 0:
 355                                 break
 356                         byte_counter += data_block_len
 357                         stream.write(data_block)
 358                         block_size = self.best_block_size(after - before, data_block_len)
 359
 360                         # Apply rate limit
 361                         self.slow_down(start, byte_counter)
 362
 363                 self.report_finish()
 364                 if data_len is not None and str(byte_counter) != data_len:
 365                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 366
 367 class InfoExtractor(object):
 368         """Information Extractor class.
 369
 370         Information extractors are the classes that, given a URL, extract
 371         information from the video (or videos) the URL refers to. This
 372         information includes the real video URL, the video title and simplified
 373         title, author and others. The information is stored in a dictionary
 374         which is then passed to the FileDownloader. The FileDownloader
 375         processes this information possibly downloading the video to the file
 376         system, among other possible outcomes. The dictionaries must include
 377         the following fields:
 378
 379         id:             Video identifier.
 380         url:            Final video URL.
 381         uploader:       Nickname of the video uploader.
 382         title:          Literal title.
 383         stitle:         Simplified title.
 384         ext:            Video filename extension.
 385
 386         Subclasses of this one should re-define the _real_initialize() and
 387         _real_extract() methods, as well as the suitable() static method.
 388         Probably, they should also be instantiated and added to the main
 389         downloader.
 390         """
 391
 392         _ready = False
 393         _downloader = None
 394
 395         def __init__(self, downloader=None):
 396                 """Constructor. Receives an optional downloader."""
 397                 self._ready = False
 398                 self.set_downloader(downloader)
 399
 400         @staticmethod
 401         def suitable(url):
 402                 """Receives a URL and returns True if suitable for this IE."""
 403                 return False
 404
 405         def initialize(self):
 406                 """Initializes an instance (authentication, etc)."""
 407                 if not self._ready:
 408                         self._real_initialize()
 409                         self._ready = True
 410
 411         def extract(self, url):
 412                 """Extracts URL information and returns it in list of dicts."""
 413                 self.initialize()
 414                 return self._real_extract(url)
 415
 416         def set_downloader(self, downloader):
 417                 """Sets the downloader for this IE."""
 418                 self._downloader = downloader
 419
 420         def _real_initialize(self):
 421                 """Real initialization process. Redefine in subclasses."""
 422                 pass
 423
 424         def _real_extract(self, url):
 425                 """Real extraction process. Redefine in subclasses."""
 426                 pass
 427
 428 class YoutubeIE(InfoExtractor):
 429         """Information extractor for youtube.com."""
 430
 431         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 432         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 433         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 434         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 435         _NETRC_MACHINE = 'youtube'
 436         _available_formats = ['22', '18', '17', '13'] # listed in order of priority for -b flag
 437         _video_extensions = {
 438                 '13': '3gp',
 439                 '17': 'mp4',
 440                 '18': 'mp4',
 441                 '22': 'mp4',
 442         }
 443
 444         @staticmethod
 445         def suitable(url):
 446                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 447
 448         @staticmethod
 449         def htmlentity_transform(matchobj):
 450                 """Transforms an HTML entity to a Unicode character."""
 451                 entity = matchobj.group(1)
 452
 453                 # Known non-numeric HTML entity
 454                 if entity in htmlentitydefs.name2codepoint:
 455                         return unichr(htmlentitydefs.name2codepoint[entity])
 456
 457                 # Unicode character
 458                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
 459                 if mobj is not None:
 460                         numstr = mobj.group(1)
 461                         if numstr.startswith(u'x'):
 462                                 base = 16
 463                                 numstr = u'0%s' % numstr
 464                         else:
 465                                 base = 10
 466                         return unichr(long(numstr, base))
 467
 468                 # Unknown entity in name, return its literal representation
 469                 return (u'&%s;' % entity)
 470
 471         def report_lang(self):
 472                 """Report attempt to set language."""
 473                 self._downloader.to_stdout(u'[youtube] Setting language')
 474
 475         def report_login(self):
 476                 """Report attempt to log in."""
 477                 self._downloader.to_stdout(u'[youtube] Logging in')
 478
 479         def report_age_confirmation(self):
 480                 """Report attempt to confirm age."""
 481                 self._downloader.to_stdout(u'[youtube] Confirming age')
 482
 483         def report_webpage_download(self, video_id):
 484                 """Report attempt to download webpage."""
 485                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 486
 487         def report_information_extraction(self, video_id):
 488                 """Report attempt to extract video information."""
 489                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 490
 491         def report_video_url(self, video_id, video_real_url):
 492                 """Report extracted video URL."""
 493                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 494
 495         def report_unavailable_format(self, video_id, format):
 496                 """Report extracted video URL."""
 497                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 498
 499         def _real_initialize(self):
 500                 if self._downloader is None:
 501                         return
 502
 503                 username = None
 504                 password = None
 505                 downloader_params = self._downloader.params
 506
 507                 # Attempt to use provided username and password or .netrc data
 508                 if downloader_params.get('username', None) is not None:
 509                         username = downloader_params['username']
 510                         password = downloader_params['password']
 511                 elif downloader_params.get('usenetrc', False):
 512                         try:
 513                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 514                                 if info is not None:
 515                                         username = info[0]
 516                                         password = info[2]
 517                                 else:
 518                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 519                         except (IOError, netrc.NetrcParseError), err:
 520                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 521                                 return
 522
 523                 # Set language
 524                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 525                 try:
 526                         self.report_lang()
 527                         urllib2.urlopen(request).read()
 528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 529                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 530                         return
 531
 532                 # No authentication to be performed
 533                 if username is None:
 534                         return
 535
 536                 # Log in
 537                 login_form = {
 538                                 'current_form': 'loginForm',
 539                                 'next':         '/',
 540                                 'action_login': 'Log In',
 541                                 'username':     username,
 542                                 'password':     password,
 543                                 }
 544                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 545                 try:
 546                         self.report_login()
 547                         login_results = urllib2.urlopen(request).read()
 548                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 549                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 550                                 return
 551                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 552                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 553                         return
 554
 555                 # Confirm age
 556                 age_form = {
 557                                 'next_url':             '/',
 558                                 'action_confirm':       'Confirm',
 559                                 }
 560                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 561                 try:
 562                         self.report_age_confirmation()
 563                         age_results = urllib2.urlopen(request).read()
 564                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 565                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 566                         return
 567
 568         def _real_extract(self, url):
 569                 # Extract video id from URL
 570                 mobj = re.match(self._VALID_URL, url)
 571                 if mobj is None:
 572                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 573                         return
 574                 video_id = mobj.group(2)
 575
 576                 # Downloader parameters
 577                 best_quality = False
 578                 format_param = None
 579                 quality_index = 0
 580                 if self._downloader is not None:
 581                         params = self._downloader.params
 582                         format_param = params.get('format', None)
 583                         if format_param == '0':
 584                                 format_param = self._available_formats[quality_index]
 585                                 best_quality = True
 586
 587                 while True:
 588                         try:
 589                                 # Extension
 590                                 video_extension = self._video_extensions.get(format_param, 'flv')
 591
 592                                 # Normalize URL, including format
 593                                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 594                                 if format_param is not None:
 595                                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 596                                 request = urllib2.Request(normalized_url, None, std_headers)
 597                                 try:
 598                                         self.report_webpage_download(video_id)
 599                                         video_webpage = urllib2.urlopen(request).read()
 600                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 601                                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
 602                                         return
 603                                 self.report_information_extraction(video_id)
 604
 605                                 # "t" param
 606                                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 607                                 if mobj is None:
 608                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
 609                                         return
 610                                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
 611                                 if format_param is not None:
 612                                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 613                                 self.report_video_url(video_id, video_real_url)
 614
 615                                 # uploader
 616                                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 617                                 if mobj is None:
 618                                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 619                                         return
 620                                 video_uploader = mobj.group(1)
 621
 622                                 # title
 623                                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 624                                 if mobj is None:
 625                                         self._downloader.trouble(u'ERROR: unable to extract video title')
 626                                         return
 627                                 video_title = mobj.group(1).decode('utf-8')
 628                                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
 629                                 video_title = video_title.replace(os.sep, u'%')
 630
 631                                 # simplified title
 632                                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 633                                 simple_title = simple_title.strip(ur'_')
 634
 635                                 # Process video information
 636                                 self._downloader.process_info({
 637                                         'id':           video_id.decode('utf-8'),
 638                                         'url':          video_real_url.decode('utf-8'),
 639                                         'uploader':     video_uploader.decode('utf-8'),
 640                                         'title':        video_title,
 641                                         'stitle':       simple_title,
 642                                         'ext':          video_extension.decode('utf-8'),
 643                                 })
 644
 645                                 return
 646
 647                         except UnavailableFormatError, err:
 648                                 if best_quality:
 649                                         if quality_index == len(self._available_formats) - 1:
 650                                                 # I don't ever expect this to happen
 651                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
 652                                                 return
 653                                         else:
 654                                                 self.report_unavailable_format(video_id, format_param)
 655                                                 quality_index += 1
 656                                                 format_param = self._available_formats[quality_index]
 657                                                 continue
 658                                 else:
 659                                         self._downloader.trouble('ERROR: format not available for video')
 660                                         return
 661
 662
 663 class MetacafeIE(InfoExtractor):
 664         """Information Extractor for metacafe.com."""
 665
 666         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 667         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 668         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 669         _youtube_ie = None
 670
 671         def __init__(self, youtube_ie, downloader=None):
 672                 InfoExtractor.__init__(self, downloader)
 673                 self._youtube_ie = youtube_ie
 674
 675         @staticmethod
 676         def suitable(url):
 677                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 678
 679         def report_disclaimer(self):
 680                 """Report disclaimer retrieval."""
 681                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 682
 683         def report_age_confirmation(self):
 684                 """Report attempt to confirm age."""
 685                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 686
 687         def report_download_webpage(self, video_id):
 688                 """Report webpage download."""
 689                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 690
 691         def report_extraction(self, video_id):
 692                 """Report information extraction."""
 693                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 694
 695         def _real_initialize(self):
 696                 # Retrieve disclaimer
 697                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 698                 try:
 699                         self.report_disclaimer()
 700                         disclaimer = urllib2.urlopen(request).read()
 701                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 702                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 703                         return
 704
 705                 # Confirm age
 706                 disclaimer_form = {
 707                         'filters': '0',
 708                         'submit': "Continue - I'm over 18",
 709                         }
 710                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 711                 try:
 712                         self.report_age_confirmation()
 713                         disclaimer = urllib2.urlopen(request).read()
 714                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 715                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 716                         return
 717
 718         def _real_extract(self, url):
 719                 # Extract id and simplified title from URL
 720                 mobj = re.match(self._VALID_URL, url)
 721                 if mobj is None:
 722                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 723                         return
 724
 725                 video_id = mobj.group(1)
 726
 727                 # Check if video comes from YouTube
 728                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 729                 if mobj2 is not None:
 730                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 731                         return
 732
 733                 simple_title = mobj.group(2).decode('utf-8')
 734                 video_extension = 'flv'
 735
 736                 # Retrieve video webpage to extract further information
 737                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 738                 try:
 739                         self.report_download_webpage(video_id)
 740                         webpage = urllib2.urlopen(request).read()
 741                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 742                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 743                         return
 744
 745                 # Extract URL, uploader and title from webpage
 746                 self.report_extraction(video_id)
 747                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
 748                 if mobj is None:
 749                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 750                         return
 751                 mediaURL = urllib.unquote(mobj.group(1))
 752
 753                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 754                 if mobj is None:
 755                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 756                         return
 757                 gdaKey = mobj.group(1)
 758
 759                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 760
 761                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 762                 if mobj is None:
 763                         self._downloader.trouble(u'ERROR: unable to extract title')
 764                         return
 765                 video_title = mobj.group(1).decode('utf-8')
 766
 767                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
 768                 if mobj is None:
 769                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 770                         return
 771                 video_uploader = mobj.group(1)
 772
 773                 try:
 774                         # Process video information
 775                         self._downloader.process_info({
 776                                 'id':           video_id.decode('utf-8'),
 777                                 'url':          video_url.decode('utf-8'),
 778                                 'uploader':     video_uploader.decode('utf-8'),
 779                                 'title':        video_title,
 780                                 'stitle':       simple_title,
 781                                 'ext':          video_extension.decode('utf-8'),
 782                         })
 783                 except UnavailableFormatError:
 784                         self._downloader.trouble(u'ERROR: format not available for video')
 785
 786
 787 class YoutubeSearchIE(InfoExtractor):
 788         """Information Extractor for YouTube search queries."""
 789         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 790         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 791         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 792         _MORE_PAGES_INDICATOR = r'>Next</a>'
 793         _youtube_ie = None
 794         _max_youtube_results = 1000
 795
 796         def __init__(self, youtube_ie, downloader=None):
 797                 InfoExtractor.__init__(self, downloader)
 798                 self._youtube_ie = youtube_ie
 799
 800         @staticmethod
 801         def suitable(url):
 802                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 803
 804         def report_download_page(self, query, pagenum):
 805                 """Report attempt to download playlist page with given number."""
 806                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 807
 808         def _real_initialize(self):
 809                 self._youtube_ie.initialize()
 810
 811         def _real_extract(self, query):
 812                 mobj = re.match(self._VALID_QUERY, query)
 813                 if mobj is None:
 814                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 815                         return
 816
 817                 prefix, query = query.split(':')
 818                 prefix = prefix[8:]
 819                 if prefix == '':
 820                         self._download_n_results(query, 1)
 821                         return
 822                 elif prefix == 'all':
 823                         self._download_n_results(query, self._max_youtube_results)
 824                         return
 825                 else:
 826                         try:
 827                                 n = int(prefix)
 828                                 if n <= 0:
 829                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 830                                         return
 831                                 elif n > self._max_youtube_results:
 832                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 833                                         n = self._max_youtube_results
 834                                 self._download_n_results(query, n)
 835                                 return
 836                         except ValueError: # parsing prefix as int fails
 837                                 self._download_n_results(query, 1)
 838                                 return
 839
 840         def _download_n_results(self, query, n):
 841                 """Downloads a specified number of results for a query"""
 842
 843                 video_ids = []
 844                 already_seen = set()
 845                 pagenum = 1
 846
 847                 while True:
 848                         self.report_download_page(query, pagenum)
 849                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 850                         request = urllib2.Request(result_url, None, std_headers)
 851                         try:
 852                                 page = urllib2.urlopen(request).read()
 853                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 854                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 855                                 return
 856
 857                         # Extract video identifiers
 858                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 859                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 860                                 if video_id not in already_seen:
 861                                         video_ids.append(video_id)
 862                                         already_seen.add(video_id)
 863                                         if len(video_ids) == n:
 864                                                 # Specified n videos reached
 865                                                 for id in video_ids:
 866                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 867                                                 return
 868
 869                         if self._MORE_PAGES_INDICATOR not in page:
 870                                 for id in video_ids:
 871                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 872                                 return
 873
 874                         pagenum = pagenum + 1
 875
 876 class YoutubePlaylistIE(InfoExtractor):
 877         """Information Extractor for YouTube playlists."""
 878
 879         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 880         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 881         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 882         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 883         _youtube_ie = None
 884
 885         def __init__(self, youtube_ie, downloader=None):
 886                 InfoExtractor.__init__(self, downloader)
 887                 self._youtube_ie = youtube_ie
 888
 889         @staticmethod
 890         def suitable(url):
 891                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 892
 893         def report_download_page(self, playlist_id, pagenum):
 894                 """Report attempt to download playlist page with given number."""
 895                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 896
 897         def _real_initialize(self):
 898                 self._youtube_ie.initialize()
 899
 900         def _real_extract(self, url):
 901                 # Extract playlist id
 902                 mobj = re.match(self._VALID_URL, url)
 903                 if mobj is None:
 904                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 905                         return
 906
 907                 # Download playlist pages
 908                 playlist_id = mobj.group(1)
 909                 video_ids = []
 910                 pagenum = 1
 911
 912                 while True:
 913                         self.report_download_page(playlist_id, pagenum)
 914                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 915                         try:
 916                                 page = urllib2.urlopen(request).read()
 917                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 918                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 919                                 return
 920
 921                         # Extract video identifiers
 922                         ids_in_page = []
 923                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 924                                 if mobj.group(1) not in ids_in_page:
 925                                         ids_in_page.append(mobj.group(1))
 926                         video_ids.extend(ids_in_page)
 927
 928                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 929                                 break
 930                         pagenum = pagenum + 1
 931
 932                 for id in video_ids:
 933                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 934                 return
 935
 936 class PostProcessor(object):
 937         """Post Processor class.
 938
 939         PostProcessor objects can be added to downloaders with their
 940         add_post_processor() method. When the downloader has finished a
 941         successful download, it will take its internal chain of PostProcessors
 942         and start calling the run() method on each one of them, first with
 943         an initial argument and then with the returned value of the previous
 944         PostProcessor.
 945
 946         The chain will be stopped if one of them ever returns None or the end
 947         of the chain is reached.
 948
 949         PostProcessor objects follow a "mutual registration" process similar
 950         to InfoExtractor objects.
 951         """
 952
 953         _downloader = None
 954
 955         def __init__(self, downloader=None):
 956                 self._downloader = downloader
 957
 958         def set_downloader(self, downloader):
 959                 """Sets the downloader for this PP."""
 960                 self._downloader = downloader
 961
 962         def run(self, information):
 963                 """Run the PostProcessor.
 964
 965                 The "information" argument is a dictionary like the ones
 966                 composed by InfoExtractors. The only difference is that this
 967                 one has an extra field called "filepath" that points to the
 968                 downloaded file.
 969
 970                 When this method returns None, the postprocessing chain is
 971                 stopped. However, this method may return an information
 972                 dictionary that will be passed to the next postprocessing
 973                 object in the chain. It can be the one it received after
 974                 changing some fields.
 975
 976                 In addition, this method may raise a PostProcessingError
 977                 exception that will be taken into account by the downloader
 978                 it was called from.
 979                 """
 980                 return information # by default, do nothing
 981
 982 ### MAIN PROGRAM ###
 983 if __name__ == '__main__':
 984         try:
 985                 # Modules needed only when running the main program
 986                 import getpass
 987                 import optparse
 988
 989                 # General configuration
 990                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 991                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 992                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 993
 994                 # Parse command line
 995                 parser = optparse.OptionParser(
 996                         usage='Usage: %prog [options] url...',
 997                         version='INTERNAL',
 998                         conflict_handler='resolve',
 999                 )
1000
1001                 parser.add_option('-h', '--help',
1002                                 action='help', help='print this help text and exit')
1003                 parser.add_option('-v', '--version',
1004                                 action='version', help='print program version and exit')
1005                 parser.add_option('-i', '--ignore-errors',
1006                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1007                 parser.add_option('-r', '--rate-limit',
1008                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1009
1010                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1011                 authentication.add_option('-u', '--username',
1012                                 dest='username', metavar='UN', help='account username')
1013                 authentication.add_option('-p', '--password',
1014                                 dest='password', metavar='PW', help='account password')
1015                 authentication.add_option('-n', '--netrc',
1016                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1017                 parser.add_option_group(authentication)
1018
1019                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1020                 video_format.add_option('-f', '--format',
1021                                 action='append', dest='format', metavar='FMT', help='video format code')
1022                 video_format.add_option('-b', '--best-quality',
1023                                 action='append_const', dest='format', help='download the best quality video possible', const='0')
1024                 video_format.add_option('-m', '--mobile-version',
1025                                 action='append_const', dest='format', help='alias for -f 17', const='17')
1026                 video_format.add_option('-d', '--high-def',
1027                                 action='append_const', dest='format', help='alias for -f 22', const='22')
1028                 parser.add_option_group(video_format)
1029
1030                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1031                 verbosity.add_option('-q', '--quiet',
1032                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1033                 verbosity.add_option('-s', '--simulate',
1034                                 action='store_true', dest='simulate', help='do not download video', default=False)
1035                 verbosity.add_option('-g', '--get-url',
1036                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1037                 verbosity.add_option('-e', '--get-title',
1038                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1039                 parser.add_option_group(verbosity)
1040
1041                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1042                 filesystem.add_option('-t', '--title',
1043                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1044                 filesystem.add_option('-l', '--literal',
1045                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1046                 filesystem.add_option('-o', '--output',
1047                                 dest='outtmpl', metavar='TPL', help='output filename template')
1048                 filesystem.add_option('-a', '--batch-file',
1049                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1050                 filesystem.add_option('-w', '--no-overwrites',
1051                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1052                 parser.add_option_group(filesystem)
1053
1054                 (opts, args) = parser.parse_args()
1055
1056                 # Batch file verification
1057                 batchurls = []
1058                 if opts.batchfile is not None:
1059                         try:
1060                                 batchurls = open(opts.batchfile, 'r').readlines()
1061                                 batchurls = [x.strip() for x in batchurls]
1062                                 batchurls = [x for x in batchurls if len(x) > 0]
1063                         except IOError:
1064                                 sys.exit(u'ERROR: batch file could not be read')
1065                 all_urls = batchurls + args
1066
1067                 # Conflicting, missing and erroneous options
1068                 if len(all_urls) < 1:
1069                         parser.error(u'you must provide at least one URL')
1070                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1071                         parser.error(u'using .netrc conflicts with giving username/password')
1072                 if opts.password is not None and opts.username is None:
1073                         parser.error(u'account username missing')
1074                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1075                         parser.error(u'using output template conflicts with using title or literal title')
1076                 if opts.usetitle and opts.useliteral:
1077                         parser.error(u'using title conflicts with using literal title')
1078                 if opts.username is not None and opts.password is None:
1079                         opts.password = getpass.getpass(u'Type account password and press return:')
1080                 if opts.ratelimit is not None:
1081                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1082                         if numeric_limit is None:
1083                                 parser.error(u'invalid rate limit specified')
1084                         opts.ratelimit = numeric_limit
1085                 if opts.format is not None and len(opts.format) > 1:
1086                         parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
1087                 if opts.format is None:
1088                         real_format = None
1089                 else:
1090                         real_format = opts.format[0]
1091
1092
1093                 # Information extractors
1094                 youtube_ie = YoutubeIE()
1095                 metacafe_ie = MetacafeIE(youtube_ie)
1096                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1097                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1098
1099                 # File downloader
1100                 fd = FileDownloader({
1101                         'usenetrc': opts.usenetrc,
1102                         'username': opts.username,
1103                         'password': opts.password,
1104                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1105                         'forceurl': opts.geturl,
1106                         'forcetitle': opts.gettitle,
1107                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1108                         'format': real_format,
1109                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1110                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1111                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1112                                 or u'%(id)s.%(ext)s'),
1113                         'ignoreerrors': opts.ignoreerrors,
1114                         'ratelimit': opts.ratelimit,
1115                         'nooverwrites': opts.nooverwrites,
1116                         })
1117                 fd.add_info_extractor(youtube_search_ie)
1118                 fd.add_info_extractor(youtube_pl_ie)
1119                 fd.add_info_extractor(metacafe_ie)
1120                 fd.add_info_extractor(youtube_ie)
1121                 retcode = fd.download(all_urls)
1122                 sys.exit(retcode)
1123
1124         except DownloadError:
1125                 sys.exit(1)
1126         except SameFileError:
1127                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1128         except KeyboardInterrupt:
1129                 sys.exit(u'\nERROR: Interrupted by user')