_ Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
  21 std_headers = {
  22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
  23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  25         'Accept-Language': 'en-us,en;q=0.5',
  26 }
  27
  28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class UnavailableFormatError(Exception):
  56         """Unavailable Format exception.
  57
  58         This exception will be thrown when a video is requested
  59         in a format that is not available for that video.
  60         """
  61
  62 class FileDownloader(object):
  63         """File Downloader class.
  64
        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (a task that InfoExtractors
        perform), so it has to pass the URL to one of them.
  71
        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.
  79
  80         File downloaders accept a lot of parameters. In order not to saturate
  81         the object constructor with arguments, it receives a dictionary of
  82         options instead. These options are available through the params
  83         attribute for the InfoExtractors to use. The FileDownloader also
  84         registers itself as the downloader in charge for the InfoExtractors
  85         that are added to it, so this is a "mutual registration".
  86
  87         Available options:
  88
  89         username:       Username for authentication purposes.
  90         password:       Password for authentication purposes.
  91         usenetrc:       Use netrc for authentication instead.
  92         quiet:          Do not print messages to stdout.
  93         forceurl:       Force printing final URL.
  94         forcetitle:     Force printing title.
  95         simulate:       Do not download the video files.
  96         format:         Video format code.
  97         outtmpl:        Template for output names.
  98         ignoreerrors:   Do not stop on download errors.
  99         ratelimit:      Download speed limit, in bytes/sec.
 100         nooverwrites:   Prevent overwriting files.
 101         """
 102
 103         params = None
 104         _ies = []
 105         _pps = []
 106         _download_retcode = None
 107
 108         def __init__(self, params):
 109                 """Create a FileDownloader object with the given options."""
 110                 self._ies = []
 111                 self._pps = []
 112                 self._download_retcode = 0
 113                 self.params = params
 114
 115         @staticmethod
 116         def pmkdir(filename):
 117                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 118                 components = filename.split(os.sep)
 119                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 120                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 121                 for dir in aggregate:
 122                         if not os.path.exists(dir):
 123                                 os.mkdir(dir)
 124
 125         @staticmethod
 126         def format_bytes(bytes):
 127                 if bytes is None:
 128                         return 'N/A'
 129                 if bytes == 0:
 130                         exponent = 0
 131                 else:
 132                         exponent = long(math.log(float(bytes), 1024.0))
 133                 suffix = 'bkMGTPEZY'[exponent]
 134                 converted = float(bytes) / float(1024**exponent)
 135                 return '%.2f%s' % (converted, suffix)
 136
 137         @staticmethod
 138         def calc_percent(byte_counter, data_len):
 139                 if data_len is None:
 140                         return '---.-%'
 141                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 142
 143         @staticmethod
 144         def calc_eta(start, now, total, current):
 145                 if total is None:
 146                         return '--:--'
 147                 dif = now - start
 148                 if current == 0 or dif < 0.001: # One millisecond
 149                         return '--:--'
 150                 rate = float(current) / dif
 151                 eta = long((float(total) - float(current)) / rate)
 152                 (eta_mins, eta_secs) = divmod(eta, 60)
 153                 if eta_mins > 99:
 154                         return '--:--'
 155                 return '%02d:%02d' % (eta_mins, eta_secs)
 156
 157         @staticmethod
 158         def calc_speed(start, now, bytes):
 159                 dif = now - start
 160                 if bytes == 0 or dif < 0.001: # One millisecond
 161                         return '%10s' % '---b/s'
 162                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 163
 164         @staticmethod
 165         def best_block_size(elapsed_time, bytes):
 166                 new_min = max(bytes / 2.0, 1.0)
 167                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 168                 if elapsed_time < 0.001:
 169                         return int(new_max)
 170                 rate = bytes / elapsed_time
 171                 if rate > new_max:
 172                         return int(new_max)
 173                 if rate < new_min:
 174                         return int(new_min)
 175                 return int(rate)
 176
 177         @staticmethod
 178         def parse_bytes(bytestr):
 179                 """Parse a string indicating a byte quantity into a long integer."""
 180                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 181                 if matchobj is None:
 182                         return None
 183                 number = float(matchobj.group(1))
 184                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 185                 return long(round(number * multiplier))
 186
 187         def add_info_extractor(self, ie):
 188                 """Add an InfoExtractor object to the end of the list."""
 189                 self._ies.append(ie)
 190                 ie.set_downloader(self)
 191
 192         def add_post_processor(self, pp):
 193                 """Add a PostProcessor object to the end of the chain."""
 194                 self._pps.append(pp)
 195                 pp.set_downloader(self)
 196
 197         def to_stdout(self, message, skip_eol=False):
 198                 """Print message to stdout if not in quiet mode."""
 199                 if not self.params.get('quiet', False):
 200                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
 201                         sys.stdout.flush()
 202
 203         def to_stderr(self, message):
 204                 """Print message to stderr."""
 205                 print >>sys.stderr, message
 206
 207         def fixed_template(self):
 208                 """Checks if the output template is fixed."""
 209                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 210
 211         def trouble(self, message=None):
 212                 """Determine action to take when a download problem appears.
 213
 214                 Depending on if the downloader has been configured to ignore
 215                 download errors or not, this method may throw an exception or
 216                 not when errors are found, after printing the message.
 217                 """
 218                 if message is not None:
 219                         self.to_stderr(message)
 220                 if not self.params.get('ignoreerrors', False):
 221                         raise DownloadError(message)
 222                 self._download_retcode = 1
 223
 224         def slow_down(self, start_time, byte_counter):
 225                 """Sleep if the download speed is over the rate limit."""
 226                 rate_limit = self.params.get('ratelimit', None)
 227                 if rate_limit is None or byte_counter == 0:
 228                         return
 229                 now = time.time()
 230                 elapsed = now - start_time
 231                 if elapsed <= 0.0:
 232                         return
 233                 speed = float(byte_counter) / elapsed
 234                 if speed > rate_limit:
 235                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 236
 237         def report_destination(self, filename):
 238                 """Report destination filename."""
 239                 self.to_stdout(u'[download] Destination: %s' % filename)
 240
 241         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 242                 """Report download progress."""
 243                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 244                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 245
 246         def report_finish(self):
 247                 """Report download finished."""
 248                 self.to_stdout(u'')
 249
 250         def process_info(self, info_dict):
 251                 """Process a single dictionary returned by an InfoExtractor."""
 252                 # Forced printings
 253                 if self.params.get('forcetitle', False):
 254                         print info_dict['title'].encode(locale.getpreferredencoding())
 255                 if self.params.get('forceurl', False):
 256                         print info_dict['url'].encode(locale.getpreferredencoding())
 257
 258                 # Do nothing else if in simulate mode
 259                 if self.params.get('simulate', False):
 260                         return
 261
 262                 try:
 263                         template_dict = dict(info_dict)
 264                         template_dict['epoch'] = unicode(long(time.time()))
 265                         filename = self.params['outtmpl'] % template_dict
 266                         self.report_destination(filename)
 267                 except (ValueError, KeyError), err:
 268                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 269                 if self.params['nooverwrites'] and os.path.exists(filename):
 270                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 271                         return
 272
 273                 try:
 274                         self.pmkdir(filename)
 275                 except (OSError, IOError), err:
 276                         self.trouble('ERROR: unable to create directories: %s' % str(err))
 277                         return
 278
 279                 try:
 280                         outstream = open(filename, 'wb')
 281                 except (OSError, IOError), err:
 282                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
 283                         return
 284
 285                 try:
 286                         self._do_download(outstream, info_dict['url'])
 287                         outstream.close()
 288                 except (OSError, IOError), err:
 289                         outstream.close()
 290                         os.remove(filename)
 291                         raise UnavailableFormatError
 292                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 293                         self.trouble('ERROR: unable to download video data: %s' % str(err))
 294                         return
 295
 296                 try:
 297                         self.post_process(filename, info_dict)
 298                 except (PostProcessingError), err:
 299                         self.trouble('ERROR: postprocessing: %s' % str(err))
 300                         return
 301
 302         def download(self, url_list):
 303                 """Download a given list of URLs."""
 304                 if len(url_list) > 1 and self.fixed_template():
 305                         raise SameFileError(self.params['outtmpl'])
 306
 307                 for url in url_list:
 308                         suitable_found = False
 309                         for ie in self._ies:
 310                                 # Go to next InfoExtractor if not suitable
 311                                 if not ie.suitable(url):
 312                                         continue
 313
 314                                 # Suitable InfoExtractor found
 315                                 suitable_found = True
 316
 317                                 # Extract information from URL and process it
 318                                 ie.extract(url)
 319
 320                                 # Suitable InfoExtractor had been found; go to next URL
 321                                 break
 322
 323                         if not suitable_found:
 324                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 325
 326                 return self._download_retcode
 327
 328         def post_process(self, filename, ie_info):
 329                 """Run the postprocessing chain on the given file."""
 330                 info = dict(ie_info)
 331                 info['filepath'] = filename
 332                 for pp in self._pps:
 333                         info = pp.run(info)
 334                         if info is None:
 335                                 break
 336
 337         def _do_download(self, stream, url):
 338                 request = urllib2.Request(url, None, std_headers)
 339                 data = urllib2.urlopen(request)
 340                 data_len = data.info().get('Content-length', None)
 341                 data_len_str = self.format_bytes(data_len)
 342                 byte_counter = 0
 343                 block_size = 1024
 344                 start = time.time()
 345                 while True:
 346                         # Progress message
 347                         percent_str = self.calc_percent(byte_counter, data_len)
 348                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 349                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 350                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 351
 352                         # Download and write
 353                         before = time.time()
 354                         data_block = data.read(block_size)
 355                         after = time.time()
 356                         data_block_len = len(data_block)
 357                         if data_block_len == 0:
 358                                 break
 359                         byte_counter += data_block_len
 360                         stream.write(data_block)
 361                         block_size = self.best_block_size(after - before, data_block_len)
 362
 363                         # Apply rate limit
 364                         self.slow_down(start, byte_counter)
 365
 366                 self.report_finish()
 367                 if data_len is not None and str(byte_counter) != data_len:
 368                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 369
 370 class InfoExtractor(object):
 371         """Information Extractor class.
 372
 373         Information extractors are the classes that, given a URL, extract
 374         information from the video (or videos) the URL refers to. This
 375         information includes the real video URL, the video title and simplified
 376         title, author and others. The information is stored in a dictionary
 377         which is then passed to the FileDownloader. The FileDownloader
 378         processes this information possibly downloading the video to the file
 379         system, among other possible outcomes. The dictionaries must include
 380         the following fields:
 381
 382         id:             Video identifier.
 383         url:            Final video URL.
 384         uploader:       Nickname of the video uploader.
 385         title:          Literal title.
 386         stitle:         Simplified title.
 387         ext:            Video filename extension.
 388
 389         Subclasses of this one should re-define the _real_initialize() and
 390         _real_extract() methods, as well as the suitable() static method.
 391         Probably, they should also be instantiated and added to the main
 392         downloader.
 393         """
 394
 395         _ready = False
 396         _downloader = None
 397
 398         def __init__(self, downloader=None):
 399                 """Constructor. Receives an optional downloader."""
 400                 self._ready = False
 401                 self.set_downloader(downloader)
 402
 403         @staticmethod
 404         def suitable(url):
 405                 """Receives a URL and returns True if suitable for this IE."""
 406                 return False
 407
 408         def initialize(self):
 409                 """Initializes an instance (authentication, etc)."""
 410                 if not self._ready:
 411                         self._real_initialize()
 412                         self._ready = True
 413
 414         def extract(self, url):
 415                 """Extracts URL information and returns it in list of dicts."""
 416                 self.initialize()
 417                 return self._real_extract(url)
 418
 419         def set_downloader(self, downloader):
 420                 """Sets the downloader for this IE."""
 421                 self._downloader = downloader
 422
 423         def _real_initialize(self):
 424                 """Real initialization process. Redefine in subclasses."""
 425                 pass
 426
 427         def _real_extract(self, url):
 428                 """Real extraction process. Redefine in subclasses."""
 429                 pass
 430
 431 class YoutubeIE(InfoExtractor):
 432         """Information extractor for youtube.com."""
 433
 434         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 435         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 436         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 437         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 438         _NETRC_MACHINE = 'youtube'
 439         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
 440         _video_extensions = {
 441                 '13': '3gp',
 442                 '17': 'mp4',
 443                 '18': 'mp4',
 444                 '22': 'mp4',
 445         }
 446
 447         @staticmethod
 448         def suitable(url):
 449                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 450
 451         @staticmethod
 452         def htmlentity_transform(matchobj):
 453                 """Transforms an HTML entity to a Unicode character."""
 454                 entity = matchobj.group(1)
 455
 456                 # Known non-numeric HTML entity
 457                 if entity in htmlentitydefs.name2codepoint:
 458                         return unichr(htmlentitydefs.name2codepoint[entity])
 459
 460                 # Unicode character
 461                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
 462                 if mobj is not None:
 463                         numstr = mobj.group(1)
 464                         if numstr.startswith(u'x'):
 465                                 base = 16
 466                                 numstr = u'0%s' % numstr
 467                         else:
 468                                 base = 10
 469                         return unichr(long(numstr, base))
 470
 471                 # Unknown entity in name, return its literal representation
 472                 return (u'&%s;' % entity)
 473
 474         def report_lang(self):
 475                 """Report attempt to set language."""
 476                 self._downloader.to_stdout(u'[youtube] Setting language')
 477
 478         def report_login(self):
 479                 """Report attempt to log in."""
 480                 self._downloader.to_stdout(u'[youtube] Logging in')
 481
 482         def report_age_confirmation(self):
 483                 """Report attempt to confirm age."""
 484                 self._downloader.to_stdout(u'[youtube] Confirming age')
 485
 486         def report_webpage_download(self, video_id):
 487                 """Report attempt to download webpage."""
 488                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 489
 490         def report_information_extraction(self, video_id):
 491                 """Report attempt to extract video information."""
 492                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 493
 494         def report_video_url(self, video_id, video_real_url):
 495                 """Report extracted video URL."""
 496                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 497
 498         def report_unavailable_format(self, video_id, format):
 499                 """Report extracted video URL."""
 500                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
 501
 502         def _real_initialize(self):
 503                 if self._downloader is None:
 504                         return
 505
 506                 username = None
 507                 password = None
 508                 downloader_params = self._downloader.params
 509
 510                 # Attempt to use provided username and password or .netrc data
 511                 if downloader_params.get('username', None) is not None:
 512                         username = downloader_params['username']
 513                         password = downloader_params['password']
 514                 elif downloader_params.get('usenetrc', False):
 515                         try:
 516                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 517                                 if info is not None:
 518                                         username = info[0]
 519                                         password = info[2]
 520                                 else:
 521                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 522                         except (IOError, netrc.NetrcParseError), err:
 523                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 524                                 return
 525
 526                 # Set language
 527                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 528                 try:
 529                         self.report_lang()
 530                         urllib2.urlopen(request).read()
 531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 532                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 533                         return
 534
 535                 # No authentication to be performed
 536                 if username is None:
 537                         return
 538
 539                 # Log in
 540                 login_form = {
 541                                 'current_form': 'loginForm',
 542                                 'next':         '/',
 543                                 'action_login': 'Log In',
 544                                 'username':     username,
 545                                 'password':     password,
 546                                 }
 547                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 548                 try:
 549                         self.report_login()
 550                         login_results = urllib2.urlopen(request).read()
 551                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 552                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
 553                                 return
 554                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 555                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 556                         return
 557
 558                 # Confirm age
 559                 age_form = {
 560                                 'next_url':             '/',
 561                                 'action_confirm':       'Confirm',
 562                                 }
 563                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 564                 try:
 565                         self.report_age_confirmation()
 566                         age_results = urllib2.urlopen(request).read()
 567                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 568                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 569                         return
 570
        def _real_extract(self, url):
                """Extract the information of the video at url and hand it to the downloader.

                The video webpage is downloaded and the "t" parameter, the
                uploader nickname and the title are taken from it. The
                resulting dictionary (id, url, uploader, title, stitle, ext)
                is passed to the downloader's process_info(). Every failure
                is reported through the downloader's trouble() and ends the
                extraction.

                The downloader's 'format' option selects the video format. The
                special value '0' asks for the best quality: the formats in
                _available_formats are then tried in order, moving on to the
                next one each time process_info() raises
                UnavailableFormatError.
                """
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Downloader parameters
                best_quality = False
                format_param = None
                quality_index = 0
                if self._downloader is not None:
                        params = self._downloader.params
                        format_param = params.get('format', None)
                        if format_param == '0':
                                # Best quality requested: start with the first known format
                                format_param = self._available_formats[quality_index]
                                best_quality = True

                # Each iteration attempts one format; the loop only repeats
                # in best quality mode after an unavailable format
                while True:
                        try:
                                # Extension; formats missing from the table default to flv
                                video_extension = self._video_extensions.get(format_param, 'flv')

                                # Normalize URL, including format
                                normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
                                if format_param is not None:
                                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                                request = urllib2.Request(normalized_url, None, std_headers)
                                try:
                                        self.report_webpage_download(video_id)
                                        video_webpage = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                                        return
                                self.report_information_extraction(video_id)

                                # "t" param, needed to build the real video URL
                                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                                if mobj is None:
                                        self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
                                        return
                                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
                                if format_param is not None:
                                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                                self.report_video_url(video_id, video_real_url)

                                # uploader
                                mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
                                if mobj is None:
                                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                                        return
                                video_uploader = mobj.group(1)

                                # title: decode HTML entities and replace the path
                                # separator so the title can be used in a file name
                                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                                if mobj is None:
                                        self._downloader.trouble(u'ERROR: unable to extract video title')
                                        return
                                video_title = mobj.group(1).decode('utf-8')
                                video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
                                video_title = video_title.replace(os.sep, u'%')

                                # simplified title: every run of characters outside
                                # simple_title_chars becomes a single underscore
                                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                                simple_title = simple_title.strip(ur'_')

                                # Process video information
                                self._downloader.process_info({
                                        'id':           video_id.decode('utf-8'),
                                        'url':          video_real_url.decode('utf-8'),
                                        'uploader':     video_uploader.decode('utf-8'),
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
                                })

                                return

                        except UnavailableFormatError, err:
                                if best_quality:
                                        if quality_index == len(self._available_formats) - 1:
                                                # I don't ever expect this to happen
                                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                                return
                                        else:
                                                # Fall back to the next format in priority order
                                                self.report_unavailable_format(video_id, format_param)
                                                quality_index += 1
                                                format_param = self._available_formats[quality_index]
                                                continue
                                else:
                                        self._downloader.trouble('ERROR: format not available for video')
                                        return
 664
 665
 666 class MetacafeIE(InfoExtractor):
 667         """Information Extractor for metacafe.com."""
 668
 669         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 670         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 671         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 672         _youtube_ie = None
 673
 674         def __init__(self, youtube_ie, downloader=None):
 675                 InfoExtractor.__init__(self, downloader)
 676                 self._youtube_ie = youtube_ie
 677
 678         @staticmethod
 679         def suitable(url):
 680                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 681
 682         def report_disclaimer(self):
 683                 """Report disclaimer retrieval."""
 684                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
 685
 686         def report_age_confirmation(self):
 687                 """Report attempt to confirm age."""
 688                 self._downloader.to_stdout(u'[metacafe] Confirming age')
 689
 690         def report_download_webpage(self, video_id):
 691                 """Report webpage download."""
 692                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 693
 694         def report_extraction(self, video_id):
 695                 """Report information extraction."""
 696                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 697
 698         def _real_initialize(self):
 699                 # Retrieve disclaimer
 700                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 701                 try:
 702                         self.report_disclaimer()
 703                         disclaimer = urllib2.urlopen(request).read()
 704                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 705                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 706                         return
 707
 708                 # Confirm age
 709                 disclaimer_form = {
 710                         'filters': '0',
 711                         'submit': "Continue - I'm over 18",
 712                         }
 713                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
 714                 try:
 715                         self.report_age_confirmation()
 716                         disclaimer = urllib2.urlopen(request).read()
 717                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 718                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
 719                         return
 720
 721         def _real_extract(self, url):
 722                 # Extract id and simplified title from URL
 723                 mobj = re.match(self._VALID_URL, url)
 724                 if mobj is None:
 725                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 726                         return
 727
 728                 video_id = mobj.group(1)
 729
 730                 # Check if video comes from YouTube
 731                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 732                 if mobj2 is not None:
 733                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 734                         return
 735
 736                 simple_title = mobj.group(2).decode('utf-8')
 737                 video_extension = 'flv'
 738
 739                 # Retrieve video webpage to extract further information
 740                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 741                 try:
 742                         self.report_download_webpage(video_id)
 743                         webpage = urllib2.urlopen(request).read()
 744                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 745                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
 746                         return
 747
 748                 # Extract URL, uploader and title from webpage
 749                 self.report_extraction(video_id)
 750                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
 751                 if mobj is None:
 752                         self._downloader.trouble(u'ERROR: unable to extract media URL')
 753                         return
 754                 mediaURL = urllib.unquote(mobj.group(1))
 755
 756                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 757                 if mobj is None:
 758                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
 759                         return
 760                 gdaKey = mobj.group(1)
 761
 762                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 763
 764                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 765                 if mobj is None:
 766                         self._downloader.trouble(u'ERROR: unable to extract title')
 767                         return
 768                 video_title = mobj.group(1).decode('utf-8')
 769
 770                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
 771                 if mobj is None:
 772                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
 773                         return
 774                 video_uploader = mobj.group(1)
 775
 776                 try:
 777                         # Process video information
 778                         self._downloader.process_info({
 779                                 'id':           video_id.decode('utf-8'),
 780                                 'url':          video_url.decode('utf-8'),
 781                                 'uploader':     video_uploader.decode('utf-8'),
 782                                 'title':        video_title,
 783                                 'stitle':       simple_title,
 784                                 'ext':          video_extension.decode('utf-8'),
 785                         })
 786                 except UnavailableFormatError:
 787                         self._downloader.trouble(u'ERROR: format not available for video')
 788
 789
 790 class YoutubeSearchIE(InfoExtractor):
 791         """Information Extractor for YouTube search queries."""
 792         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 793         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 794         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 795         _MORE_PAGES_INDICATOR = r'>Next</a>'
 796         _youtube_ie = None
 797         _max_youtube_results = 1000
 798
 799         def __init__(self, youtube_ie, downloader=None):
 800                 InfoExtractor.__init__(self, downloader)
 801                 self._youtube_ie = youtube_ie
 802
 803         @staticmethod
 804         def suitable(url):
 805                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 806
 807         def report_download_page(self, query, pagenum):
 808                 """Report attempt to download playlist page with given number."""
 809                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 810
 811         def _real_initialize(self):
 812                 self._youtube_ie.initialize()
 813
 814         def _real_extract(self, query):
 815                 mobj = re.match(self._VALID_QUERY, query)
 816                 if mobj is None:
 817                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
 818                         return
 819
 820                 prefix, query = query.split(':')
 821                 prefix = prefix[8:]
 822                 if prefix == '':
 823                         self._download_n_results(query, 1)
 824                         return
 825                 elif prefix == 'all':
 826                         self._download_n_results(query, self._max_youtube_results)
 827                         return
 828                 else:
 829                         try:
 830                                 n = int(prefix)
 831                                 if n <= 0:
 832                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 833                                         return
 834                                 elif n > self._max_youtube_results:
 835                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
 836                                         n = self._max_youtube_results
 837                                 self._download_n_results(query, n)
 838                                 return
 839                         except ValueError: # parsing prefix as int fails
 840                                 self._download_n_results(query, 1)
 841                                 return
 842
 843         def _download_n_results(self, query, n):
 844                 """Downloads a specified number of results for a query"""
 845
 846                 video_ids = []
 847                 already_seen = set()
 848                 pagenum = 1
 849
 850                 while True:
 851                         self.report_download_page(query, pagenum)
 852                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 853                         request = urllib2.Request(result_url, None, std_headers)
 854                         try:
 855                                 page = urllib2.urlopen(request).read()
 856                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 857                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 858                                 return
 859
 860                         # Extract video identifiers
 861                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 862                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 863                                 if video_id not in already_seen:
 864                                         video_ids.append(video_id)
 865                                         already_seen.add(video_id)
 866                                         if len(video_ids) == n:
 867                                                 # Specified n videos reached
 868                                                 for id in video_ids:
 869                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 870                                                 return
 871
 872                         if self._MORE_PAGES_INDICATOR not in page:
 873                                 for id in video_ids:
 874                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 875                                 return
 876
 877                         pagenum = pagenum + 1
 878
 879 class YoutubePlaylistIE(InfoExtractor):
 880         """Information Extractor for YouTube playlists."""
 881
 882         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 883         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 884         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 885         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 886         _youtube_ie = None
 887
 888         def __init__(self, youtube_ie, downloader=None):
 889                 InfoExtractor.__init__(self, downloader)
 890                 self._youtube_ie = youtube_ie
 891
 892         @staticmethod
 893         def suitable(url):
 894                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 895
 896         def report_download_page(self, playlist_id, pagenum):
 897                 """Report attempt to download playlist page with given number."""
 898                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 899
 900         def _real_initialize(self):
 901                 self._youtube_ie.initialize()
 902
 903         def _real_extract(self, url):
 904                 # Extract playlist id
 905                 mobj = re.match(self._VALID_URL, url)
 906                 if mobj is None:
 907                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
 908                         return
 909
 910                 # Download playlist pages
 911                 playlist_id = mobj.group(1)
 912                 video_ids = []
 913                 pagenum = 1
 914
 915                 while True:
 916                         self.report_download_page(playlist_id, pagenum)
 917                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 918                         try:
 919                                 page = urllib2.urlopen(request).read()
 920                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 921                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
 922                                 return
 923
 924                         # Extract video identifiers
 925                         ids_in_page = []
 926                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 927                                 if mobj.group(1) not in ids_in_page:
 928                                         ids_in_page.append(mobj.group(1))
 929                         video_ids.extend(ids_in_page)
 930
 931                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 932                                 break
 933                         pagenum = pagenum + 1
 934
 935                 for id in video_ids:
 936                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
 937                 return
 938
 939 class PostProcessor(object):
 940         """Post Processor class.
 941
 942         PostProcessor objects can be added to downloaders with their
 943         add_post_processor() method. When the downloader has finished a
 944         successful download, it will take its internal chain of PostProcessors
 945         and start calling the run() method on each one of them, first with
 946         an initial argument and then with the returned value of the previous
 947         PostProcessor.
 948
 949         The chain will be stopped if one of them ever returns None or the end
 950         of the chain is reached.
 951
 952         PostProcessor objects follow a "mutual registration" process similar
 953         to InfoExtractor objects.
 954         """
 955
 956         _downloader = None
 957
 958         def __init__(self, downloader=None):
 959                 self._downloader = downloader
 960
 961         def set_downloader(self, downloader):
 962                 """Sets the downloader for this PP."""
 963                 self._downloader = downloader
 964
 965         def run(self, information):
 966                 """Run the PostProcessor.
 967
 968                 The "information" argument is a dictionary like the ones
 969                 composed by InfoExtractors. The only difference is that this
 970                 one has an extra field called "filepath" that points to the
 971                 downloaded file.
 972
 973                 When this method returns None, the postprocessing chain is
 974                 stopped. However, this method may return an information
 975                 dictionary that will be passed to the next postprocessing
 976                 object in the chain. It can be the one it received after
 977                 changing some fields.
 978
 979                 In addition, this method may raise a PostProcessingError
 980                 exception that will be taken into account by the downloader
 981                 it was called from.
 982                 """
 983                 return information # by default, do nothing
 984
 985 ### MAIN PROGRAM ###
 986 if __name__ == '__main__':
 987         try:
 988                 # Modules needed only when running the main program
 989                 import getpass
 990                 import optparse
 991
 992                 # General configuration
 993                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 994                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 995                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 996
 997                 # Parse command line
 998                 parser = optparse.OptionParser(
 999                         usage='Usage: %prog [options] url...',
1000                         version='INTERNAL',
1001                         conflict_handler='resolve',
1002                 )
1003
1004                 parser.add_option('-h', '--help',
1005                                 action='help', help='print this help text and exit')
1006                 parser.add_option('-v', '--version',
1007                                 action='version', help='print program version and exit')
1008                 parser.add_option('-i', '--ignore-errors',
1009                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1010                 parser.add_option('-r', '--rate-limit',
1011                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1012
1013                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1014                 authentication.add_option('-u', '--username',
1015                                 dest='username', metavar='UN', help='account username')
1016                 authentication.add_option('-p', '--password',
1017                                 dest='password', metavar='PW', help='account password')
1018                 authentication.add_option('-n', '--netrc',
1019                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1020                 parser.add_option_group(authentication)
1021
1022                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1023                 video_format.add_option('-f', '--format',
1024                                 action='append', dest='format', metavar='FMT', help='video format code')
1025                 video_format.add_option('-b', '--best-quality',
1026                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1027                 video_format.add_option('-m', '--mobile-version',
1028                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1029                 video_format.add_option('-d', '--high-def',
1030                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1031                 parser.add_option_group(video_format)
1032
1033                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1034                 verbosity.add_option('-q', '--quiet',
1035                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1036                 verbosity.add_option('-s', '--simulate',
1037                                 action='store_true', dest='simulate', help='do not download video', default=False)
1038                 verbosity.add_option('-g', '--get-url',
1039                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1040                 verbosity.add_option('-e', '--get-title',
1041                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1042                 parser.add_option_group(verbosity)
1043
1044                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1045                 filesystem.add_option('-t', '--title',
1046                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1047                 filesystem.add_option('-l', '--literal',
1048                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1049                 filesystem.add_option('-o', '--output',
1050                                 dest='outtmpl', metavar='TPL', help='output filename template')
1051                 filesystem.add_option('-a', '--batch-file',
1052                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1053                 filesystem.add_option('-w', '--no-overwrites',
1054                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1055                 parser.add_option_group(filesystem)
1056
1057                 (opts, args) = parser.parse_args()
1058
1059                 # Batch file verification
1060                 batchurls = []
1061                 if opts.batchfile is not None:
1062                         try:
1063                                 batchurls = open(opts.batchfile, 'r').readlines()
1064                                 batchurls = [x.strip() for x in batchurls]
1065                                 batchurls = [x for x in batchurls if len(x) > 0]
1066                         except IOError:
1067                                 sys.exit(u'ERROR: batch file could not be read')
1068                 all_urls = batchurls + args
1069
1070                 # Conflicting, missing and erroneous options
1071                 if len(all_urls) < 1:
1072                         parser.error(u'you must provide at least one URL')
1073                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1074                         parser.error(u'using .netrc conflicts with giving username/password')
1075                 if opts.password is not None and opts.username is None:
1076                         parser.error(u'account username missing')
1077                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1078                         parser.error(u'using output template conflicts with using title or literal title')
1079                 if opts.usetitle and opts.useliteral:
1080                         parser.error(u'using title conflicts with using literal title')
1081                 if opts.username is not None and opts.password is None:
1082                         opts.password = getpass.getpass(u'Type account password and press return:')
1083                 if opts.ratelimit is not None:
1084                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1085                         if numeric_limit is None:
1086                                 parser.error(u'invalid rate limit specified')
1087                         opts.ratelimit = numeric_limit
1088                 if opts.format is not None and len(opts.format) > 1:
1089                         parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
1090                 if opts.format is None:
1091                         real_format = None
1092                 else:
1093                         real_format = opts.format[0]
1094
1095
1096                 # Information extractors
1097                 youtube_ie = YoutubeIE()
1098                 metacafe_ie = MetacafeIE(youtube_ie)
1099                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1100                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1101
1102                 # File downloader
1103                 fd = FileDownloader({
1104                         'usenetrc': opts.usenetrc,
1105                         'username': opts.username,
1106                         'password': opts.password,
1107                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1108                         'forceurl': opts.geturl,
1109                         'forcetitle': opts.gettitle,
1110                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1111                         'format': real_format,
1112                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1113                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1114                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1115                                 or u'%(id)s.%(ext)s'),
1116                         'ignoreerrors': opts.ignoreerrors,
1117                         'ratelimit': opts.ratelimit,
1118                         'nooverwrites': opts.nooverwrites,
1119                         })
1120                 fd.add_info_extractor(youtube_search_ie)
1121                 fd.add_info_extractor(youtube_pl_ie)
1122                 fd.add_info_extractor(metacafe_ie)
1123                 fd.add_info_extractor(youtube_ie)
1124                 retcode = fd.download(all_urls)
1125                 sys.exit(retcode)
1126
1127         except DownloadError:
1128                 sys.exit(1)
1129         except SameFileError:
1130                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1131         except KeyboardInterrupt:
1132                 sys.exit(u'\nERROR: Interrupted by user')