Verify URLs in simulate mode (fixes issue #22)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import sys
import time
import urllib
import urllib2

std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        pass

class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass

class PostProcessingError(Exception):
        """Post Processing exception.

        This exception may be raised by PostProcessor's .run() method to
        indicate an error in the postprocessing task.
        """
        pass

class UnavailableFormatError(Exception):
        """Unavailable Format exception.

        This exception will be thrown when a video is requested
        in a format that is not available for that video.
        """
        pass

class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded = downloaded
                self.expected = expected
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the InfoExtractors' task),
        so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge of the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        """

        params = None
        _ies = []
        _pps = []
        _download_retcode = None

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

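        # e.g. format_bytes(1536) returns '1.50k' and format_bytes(None) returns 'N/A'.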
        @staticmethod
        def format_bytes(bytes):
                if bytes is None:
                        return 'N/A'
                if bytes == 0:
                        exponent = 0
                else:
                        exponent = long(math.log(float(bytes), 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

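        # Adaptive block sizing: best_block_size() returns roughly the measured number
        # of bytes read per second, clamped between half and double the previous block
        # size and capped at 4 MB, so each read takes on the order of one second.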
        @staticmethod
        def best_block_size(elapsed_time, bytes):
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

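        # Accepted syntax, derived from the regex below: a decimal number with an
        # optional suffix, e.g. parse_bytes('300') == 300, parse_bytes('50k') == 51200
        # and parse_bytes('0.5m') == 524288.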
        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

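        # verify_url() is used by process_info() when running in simulate mode
        # (-s, -g, -e): requesting the URL and reading a single byte is enough to
        # detect videos that could not actually be downloaded (see issue #22).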
        @staticmethod
        def verify_url(url):
                """Verify a URL is valid and data could be downloaded."""
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                data.read(1)
                data.close()

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self.params.get('quiet', False):
                        print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def fixed_template(self):
                """Checks if the output template is fixed."""
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on whether the downloader has been configured to
                ignore download errors or not, this method may throw an
                exception when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                self._download_retcode = 1

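        # Rate limiting: if the average speed so far exceeds the 'ratelimit' option,
        # sleep just long enough for byte_counter / elapsed to drop back to the
        # configured bytes-per-second value.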
        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_finish(self):
                """Report download finished."""
                self.to_stdout(u'')

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        try:
                                self.verify_url(info_dict['url'])
                        except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                                raise UnavailableFormatError

                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(locale.getpreferredencoding())
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(locale.getpreferredencoding())

                        return


                try:
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        filename = self.params['outtmpl'] % template_dict
                        self.report_destination(filename)
                except (ValueError, KeyError), err:
                        self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                if self.params['nooverwrites'] and os.path.exists(filename):
                        self.to_stderr('WARNING: file exists: %s; skipping' % filename)
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble('ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError), err:
                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                        return

                try:
                        self._do_download(outstream, info_dict['url'])
                        outstream.close()
                except (OSError, IOError), err:
                        outstream.close()
                        os.remove(filename)
                        raise UnavailableFormatError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble('ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                try:
                        self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                        self.trouble('ERROR: postprocessing: %s' % str(err))
                        return

        def download(self, url_list):
                """Download a given list of URLs."""
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        if info is None:
                                break

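        # Plain HTTP download with progress reporting. If the final byte count does
        # not match the Content-Length header, ContentTooShortError is raised so the
        # caller can report the truncated download.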
        def _do_download(self, stream, url):
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len
                        stream.write(data_block)
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))

class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, the uploader nickname and others. The information is stored in
        a dictionary which is then passed to the FileDownloader. The
        FileDownloader processes this information, possibly downloading the
        video to the file system, among other possible outcomes. The
        dictionaries must include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it as a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass

class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

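        # In _VALID_URL, group 1 matches the optional scheme/host/path prefix and
        # group 2 is the video ID; the conditional group (?(1).+)? only allows
        # trailing characters when a full URL (rather than a bare ID) was given.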
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        _NETRC_MACHINE = 'youtube'
        _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
        }

        @staticmethod
        def suitable(url):
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        @staticmethod
        def htmlentity_transform(matchobj):
                """Transforms an HTML entity to a Unicode character."""
                entity = matchobj.group(1)

                # Known non-numeric HTML entity
                if entity in htmlentitydefs.name2codepoint:
                        return unichr(htmlentitydefs.name2codepoint[entity])

                # Unicode character
                mobj = re.match(ur'(?u)#(x?\d+)', entity)
                if mobj is not None:
                        numstr = mobj.group(1)
                        if numstr.startswith(u'x'):
                                base = 16
                                numstr = u'0%s' % numstr
                        else:
                                base = 10
                        return unichr(long(numstr, base))

                # Unknown entity in name, return its literal representation
                return (u'&%s;' % entity)

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_stdout(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_stdout(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_stdout(u'[youtube] Confirming age')

        def report_webpage_download(self, video_id):
                """Report attempt to download webpage."""
                self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

        def report_video_url(self, video_id, video_real_url):
                """Report extracted video URL."""
                self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

        def report_unavailable_format(self, video_id, format):
                """Report that the requested video format is unavailable."""
                self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

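        # Initialization: force the English YouTube interface, log in if credentials
        # were supplied (directly or via .netrc) and confirm age, so that restricted
        # videos can be retrieved later on.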
        def _real_initialize(self):
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL, None, std_headers)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Downloader parameters
                best_quality = False
                format_param = None
                quality_index = 0
                if self._downloader is not None:
                        params = self._downloader.params
                        format_param = params.get('format', None)
                        if format_param == '0':
                                format_param = self._available_formats[quality_index]
                                best_quality = True

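                # With -b (format '0') the codes in self._available_formats are tried
                # from best to worst: an UnavailableFormatError simply moves on to the
                # next format instead of giving up.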
                while True:
                        try:
                                # Extension
                                video_extension = self._video_extensions.get(format_param, 'flv')

                                # Normalize URL, including format
                                normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
                                if format_param is not None:
                                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                                request = urllib2.Request(normalized_url, None, std_headers)
                                try:
                                        self.report_webpage_download(video_id)
                                        video_webpage = urllib2.urlopen(request).read()
                                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                                        return
                                self.report_information_extraction(video_id)

                                # "t" param
                                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                                if mobj is None:
                                        self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
                                        return
                                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
                                if format_param is not None:
                                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                                self.report_video_url(video_id, video_real_url)

                                # uploader
                                mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
                                if mobj is None:
                                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                                        return
                                video_uploader = mobj.group(1)

                                # title
                                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                                if mobj is None:
                                        self._downloader.trouble(u'ERROR: unable to extract video title')
                                        return
                                video_title = mobj.group(1).decode('utf-8')
                                video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
                                video_title = video_title.replace(os.sep, u'%')

                                # simplified title
                                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                                simple_title = simple_title.strip(ur'_')

                                # Process video information
                                self._downloader.process_info({
                                        'id':           video_id.decode('utf-8'),
                                        'url':          video_real_url.decode('utf-8'),
                                        'uploader':     video_uploader.decode('utf-8'),
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
                                })

                                return

                        except UnavailableFormatError, err:
                                if best_quality:
                                        if quality_index == len(self._available_formats) - 1:
                                                # I don't ever expect this to happen
                                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                                return
                                        else:
                                                self.report_unavailable_format(video_id, format_param)
                                                quality_index += 1
                                                format_param = self._available_formats[quality_index]
                                                continue
                                else:
                                        self._downloader.trouble('ERROR: format not available for video')
                                        return


class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com."""

        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(MetacafeIE._VALID_URL, url) is not None)

        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_stdout(u'[metacafe] Confirming age')

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

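        # Metacafe shows a disclaimer / family-filter page before some videos; the
        # form below is submitted once per session so that the video pages fetched
        # later are served directly.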
        def _real_initialize(self):
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                video_id = mobj.group(1)

                # Check if video comes from YouTube
                mobj2 = re.match(r'^yt-(.*)$', video_id)
                if mobj2 is not None:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
                        return

                simple_title = mobj.group(2).decode('utf-8')
                video_extension = 'flv'

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                mediaURL = urllib.unquote(mobj.group(1))

                mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract gdaKey')
                        return
                gdaKey = mobj.group(1)

                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')

                mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = mobj.group(1)

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                        })
                except UnavailableFormatError:
                        self._downloader.trouble(u'ERROR: format not available for video')


class YoutubeSearchIE(InfoExtractor):
        """Information Extractor for YouTube search queries."""
        _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
        _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
        _MORE_PAGES_INDICATOR = r'>Next</a>'
        _youtube_ie = None
        _max_youtube_results = 1000

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

        def report_download_page(self, query, pagenum):
                """Report attempt to download search results page with given number."""
                self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

        def _real_initialize(self):
                self._youtube_ie.initialize()

        def _real_extract(self, query):
                mobj = re.match(self._VALID_QUERY, query)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
                        return

                prefix, query = query.split(':')
                prefix = prefix[8:]
                if prefix == '':
                        self._download_n_results(query, 1)
                        return
                elif prefix == 'all':
                        self._download_n_results(query, self._max_youtube_results)
                        return
                else:
                        try:
                                n = int(prefix)
                                if n <= 0:
                                        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                                        return
                                elif n > self._max_youtube_results:
                                        self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                                        n = self._max_youtube_results
                                self._download_n_results(query, n)
                                return
                        except ValueError: # parsing prefix as int fails
                                self._download_n_results(query, 1)
                                return

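        # Search result pages are fetched until either n video IDs have been collected
        # or the "Next" link disappears; the IDs are then handed one by one to the
        # YouTube InfoExtractor.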
        def _download_n_results(self, query, n):
                """Downloads a specified number of results for a query"""

                video_ids = []
                already_seen = set()
                pagenum = 1

                while True:
                        self.report_download_page(query, pagenum)
                        result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
                        request = urllib2.Request(result_url, None, std_headers)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                                return

                        # Extract video identifiers
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                                if video_id not in already_seen:
                                        video_ids.append(video_id)
                                        already_seen.add(video_id)
                                        if len(video_ids) == n:
                                                # Specified n videos reached
                                                for id in video_ids:
                                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                                                return

                        if self._MORE_PAGES_INDICATOR not in page:
                                for id in video_ids:
                                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                                return

                        pagenum = pagenum + 1

class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists."""

        _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/view_play_list\?p=(.+)'
        _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
        _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

        def report_download_page(self, playlist_id, pagenum):
                """Report attempt to download playlist page with given number."""
                self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

        def _real_initialize(self):
                self._youtube_ie.initialize()

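        # Playlist pages are walked until the link for page pagenum+1 is no longer
        # present; the collected video IDs are then delegated to the YouTube
        # InfoExtractor.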
        def _real_extract(self, url):
                # Extract playlist id
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # Download playlist pages
                playlist_id = mobj.group(1)
                video_ids = []
                pagenum = 1

                while True:
                        self.report_download_page(playlist_id, pagenum)
                        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                                return

                        # Extract video identifiers
                        ids_in_page = []
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                if mobj.group(1) not in ids_in_page:
                                        ids_in_page.append(mobj.group(1))
                        video_ids.extend(ids_in_page)

                        if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
                                break
                        pagenum = pagenum + 1

                for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

class PostProcessor(object):
        """Post Processor class.

        PostProcessor objects can be added to downloaders with their
        add_post_processor() method. When the downloader has finished a
        successful download, it will take its internal chain of PostProcessors
        and start calling the run() method on each one of them, first with
        an initial argument and then with the returned value of the previous
        PostProcessor.

        The chain will be stopped if one of them ever returns None or the end
        of the chain is reached.

        PostProcessor objects follow a "mutual registration" process similar
        to InfoExtractor objects.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Sets the downloader for this PP."""
                self._downloader = downloader

        def run(self, information):
                """Run the PostProcessor.

                The "information" argument is a dictionary like the ones
                composed by InfoExtractors. The only difference is that this
                one has an extra field called "filepath" that points to the
                downloaded file.

                When this method returns None, the postprocessing chain is
                stopped. However, this method may return an information
                dictionary that will be passed to the next postprocessing
                object in the chain. It can be the one it received after
                changing some fields.

                In addition, this method may raise a PostProcessingError
                exception that will be taken into account by the downloader
                it was called from.
                """
                return information # by default, do nothing

### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
                urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='INTERNAL',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='append', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchurls = open(opts.batchfile, 'r').readlines()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        parser.error(u'you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit
                if opts.format is not None and len(opts.format) > 1:
                        parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
                if opts.format is None:
                        real_format = None
                else:
                        real_format = opts.format[0]


                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
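                # The output template falls back through the or-chain below: an explicit
                # -o template wins, then -t (simplified title), then -l (literal title),
                # then the default '%(id)s.%(ext)s'.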
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': real_format,
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')