2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
# Standard HTTP headers sent with every request; they mimic a Firefox 3
# browser so sites serve their normal pages.
# NOTE(review): the enclosing "std_headers = {" opener and closing brace are
# elided from this excerpt; these are the dictionary's entries.
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',

# Characters allowed in "simplified" titles: ASCII letters and digits, as a
# unicode string (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
class DownloadError(Exception):
    """Error raised when a download fails.

    FileDownloader objects throw this exception when they are not
    configured to keep going after an error; the exception carries the
    relevant error message.
    """
    pass
class SameFileError(Exception):
    """Error raised when two downloads would collide on disk.

    FileDownloader objects throw this exception when they detect that
    more than one file would be written to the same output filename.
    """
    pass
class PostProcessingError(Exception):
    """Error raised by a PostProcessor.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task failed.
    """
    pass
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit:    Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    """

    # Process exit code accumulated across downloads: 0 until an ignored
    # error sets it to 1 (see trouble()).
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): further attribute initialization (e.g. self.params and
        # the InfoExtractor/PostProcessor lists) happens on lines elided from
        # this excerpt.
        self._download_retcode = 0

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Ancestor paths, shortest first: "a", "a/b", "a/b/c", ...
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):

    def format_bytes(bytes):
        """Format a byte count as a short human-readable string, e.g. "1.23M"."""
        # 1024-based exponent selects the suffix: b, k, M, G, ...
        exponent = long(math.log(float(bytes), 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        """Return download progress as a right-aligned percentage string."""
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        """Estimate remaining download time, formatted as MM:SS."""
        # NOTE(review): 'dif' (elapsed seconds) is computed on a line elided
        # from this excerpt.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        """Return the average download speed as a padded human-readable string."""
        # NOTE(review): 'dif' (now - start) is computed on an elided line.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        """Choose the next read size based on the last block's throughput."""
        # Clamp between half and double the previous block, never above 4 MB.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # An empty suffix indexes 'b' (position 0), i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE gets a reference back to this downloader.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # Trailing comma suppresses print's own newline; one is appended
            # explicitly unless skip_eol is set (used for \r progress lines).
            print u'%s%s' % (message, [u'\n', u''][skip_eol]),

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message

    def fixed_template(self):
        """Checks if the output template is fixed (no %(field)s placeholders)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are ignored: just remember a failure for the exit code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        # NOTE(review): 'now' is bound on a line elided from this excerpt.
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep exactly long enough for the average speed to drop to the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        # \r rewrites the same terminal line; skip_eol keeps the cursor on it.
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # NOTE(review): several try:/return lines are elided from this excerpt;
        # the except clauses below pair with those elided try blocks.
        # Forced printing happens even when simulating.
        if self.params.get('forcetitle', False):
            print info_dict['title']
        if self.params.get('forceurl', False):
            print info_dict['url']

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):

            # The template is filled in with the info_dict fields.
            filename = self.params['outtmpl'] % info_dict
            self.report_destination(filename)
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
        if self.params['nooverwrites'] and os.path.exists(filename):
            self.to_stderr('WARNING: file exists: %s; skipping' % filename)

            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))

            outstream = open(filename, 'wb')
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to open for writing: %s' % str(err))

            self._do_download(outstream, info_dict['url'])

        except (OSError, IOError), err:
            self.trouble('ERROR: unable to write video data: %s' % str(err))

        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))

            self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble('ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A placeholder-free template can only name one file on disk.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the loops over url_list and the registered
        # InfoExtractors run on lines elided from this excerpt.
            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): 'info' is derived from ie_info on an elided line, and
        # the chain iteration follows on further elided lines.
        info['filepath'] = filename

    def _do_download(self, stream, url):
        """Download 'url' into the open 'stream', reporting progress as it goes."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        # Content-length header arrives as a string (or None when absent).
        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)

            # Progress message for the current state of the transfer.
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            data_block = data.read(block_size)
            data_block_len = len(data_block)
            if data_block_len == 0:
            byte_counter += data_block_len
            stream.write(data_block)
            # Adapt the next read size to the observed throughput.
            block_size = self.best_block_size(after - before, data_block_len)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        # data_len is a string; compare against the stringified byte count.
        if data_len is not None and str(byte_counter) != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): further attribute initialization happens on lines
        # elided from this excerpt.
        self.set_downloader(downloader)

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard around this call exists on elided lines.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1: optional URL prefix; group 2: the video identifier
    # (see _real_extract, which reads mobj.group(2)).
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in .netrc for stored credentials.
    _NETRC_MACHINE = 'youtube'

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference: decimal (#160) or hex (#xA0).
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
            numstr = mobj.group(1)
            if numstr.startswith(u'x'):
                # Prefix with 0 so long() accepts the 0x... form.
                numstr = u'0%s' % numstr
            # NOTE(review): 'base' (10 or 16) is chosen on elided lines.
            return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

    def _real_initialize(self):
        """Set language, log in and confirm age ahead of extraction.

        All three steps are best-effort except age confirmation, which calls
        trouble() on failure.
        """
        # NOTE(review): several try:/return lines and the login/age form dict
        # openers are elided from this excerpt.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language: failure only warns.
        request = urllib2.Request(self._LANG_URL, None, std_headers)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

        # Log in: entries of the login form dict (opener elided).
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age: entry of the age form dict (opener elided).
            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract and process the information for one YouTube video URL."""
        # NOTE(review): several try:/return lines and the format-code table
        # are elided from this excerpt.
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Downloader parameters
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            # Elided dict maps format codes to extensions; default is flv.
            }.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
            self.report_webpage_download(video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
        self.report_information_extraction(video_id)

        # The "t" value scraped here is required by the get_video endpoint.
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.report_video_url(video_id, video_real_url)

        # Uploader nickname, scraped from an inline script variable.
        mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Title, from the page <title> tag.
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')
        # Decode HTML entities, then make the title filesystem-safe.
        video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
        video_title = video_title.replace(os.sep, u'%')

        # Simplified title: runs of disallowed characters collapse to '_'.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1: video id; group 2: simplified title taken from the URL path
    # (see _real_extract, which reads groups 1 and 2).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # yt-prefixed Metacafe ids are delegated to this YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and confirm age."""
        # NOTE(review): try:/return lines and the disclaimer form dict opener
        # are elided from this excerpt.
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

        # Confirm age: entry of the disclaimer form dict (opener elided).
            'submit': "Continue - I'm over 18",
        request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Extract and process the information for one Metacafe video URL."""
        # NOTE(review): several try:/return lines are elided from this excerpt.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Mirrored video: delegate to the YouTube extractor.
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        # The URL is JSON-escaped: drop the backslashes.
        mediaURL = mobj.group(1).replace('\\', '')

        mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        gdaKey = mobj.group(1)

        # The gdaKey acts as an access token appended to the media URL.
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        # Strip any markup surrounding the submitter name.
        video_uploader = re.sub(r'<.*?>', '', mobj.group(1))

        # Process video information
        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q".
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    # A "Next" link in the page means more result pages exist.
    _MORE_PAGES_INDICATOR = r'>Next</a>'
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Individual results are handed off to this YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        # The delegate YouTube IE does the actual authentication work.
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested result count."""
        # NOTE(review): several try:/return lines are elided from this excerpt.
        mobj = re.match(self._VALID_QUERY, query)
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
            # Bare "ytsearch:" downloads a single result.
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                # Cap the request at YouTube's maximum and warn.
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as int fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): loop headers, accumulators (video_ids, already_seen,
        # pagenum) and try:/return lines are elided from this excerpt.
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Pull the id out of href="/watch?v=ID" (drop trailing quote).
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            # No "Next" link: this was the last result page.
            if self._MORE_PAGES_INDICATOR not in page:
                self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Group 1: the playlist identifier.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # Filled in with (playlist_id, next page) to detect further pages.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        # Individual playlist entries are handed off to this YouTube extractor.
        self._youtube_ie = youtube_ie

        # NOTE(review): the enclosing 'def suitable(url):' line is elided here.
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        # The delegate YouTube IE does the actual authentication work.
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Collect all video ids in the playlist, then extract each one."""
        # NOTE(review): loop headers, accumulators (video_ids, ids_in_page,
        # pagenum) and try:/return lines are elided from this excerpt.
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist pages
        playlist_id = mobj.group(1)
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

            # Extract video identifiers, de-duplicated within the page.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when no link to the next page is present.
            if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
            pagenum = pagenum + 1

            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        returned by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader.
        """
        return information # by default, do nothing
if __name__ == '__main__':
    # NOTE(review): the enclosing try: and the local import lines are elided
    # from this excerpt; the except clauses at the bottom pair with that try.

    # Modules needed only when running the main program

    # General configuration
    urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
    # Installed second, so the cookie-enabled opener is the one in effect.
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    # Command-line parser; 'resolve' lets -h/-v be redefined below.
    parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            conflict_handler='resolve',
    parser.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    parser.add_option('-v', '--version',
            action='version', help='print program version and exit')
    parser.add_option('-u', '--username',
            dest='username', metavar='UN', help='account username')
    parser.add_option('-p', '--password',
            dest='password', metavar='PW', help='account password')
    parser.add_option('-o', '--output',
            dest='outtmpl', metavar='TPL', help='output filename template')
    parser.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    parser.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download video', default=False)
    parser.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    parser.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    parser.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
    parser.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    parser.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    parser.add_option('-f', '--format',
            dest='format', metavar='FMT', help='video format code')
    parser.add_option('-m', '--mobile-version',
            action='store_const', dest='format', help='alias for -f 17', const='17')
    parser.add_option('-d', '--high-def',
            action='store_const', dest='format', help='alias for -f 22', const='22')
    parser.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    parser.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
    parser.add_option('-a', '--batch-file',
            dest='batchfile', metavar='F', help='file containing URLs to download')
    parser.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    (opts, args) = parser.parse_args()

    # Batch file verification
    if opts.batchfile is not None:
            batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # Conflicting, missing and erroneous options
    if len(all_urls) < 1:
        sys.exit(u'ERROR: you must provide at least one URL')
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        sys.exit(u'ERROR: account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
    if opts.usetitle and opts.useliteral:
        sys.exit(u'ERROR: using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Prompt rather than require the password on the command line.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        # Replace the textual limit ("50k") with its numeric value in bytes/sec.
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            sys.exit(u'ERROR: invalid rate limit specified')
        opts.ratelimit = numeric_limit

    # Information extractors
    youtube_ie = YoutubeIE()
    metacafe_ie = MetacafeIE(youtube_ie)
    youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
    youtube_search_ie = YoutubeSearchIE(youtube_ie)

    # File downloader; outtmpl falls through -o, -t, -l, then the id default.
    charset = locale.getpreferredencoding()
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
        'format': opts.format,
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
    # More specific extractors are registered before the generic YouTube one.
    fd.add_info_extractor(youtube_search_ie)
    fd.add_info_extractor(youtube_pl_ie)
    fd.add_info_extractor(metacafe_ie)
    fd.add_info_extractor(youtube_ie)
    retcode = fd.download(all_urls)

    except DownloadError:
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')