Update and correct (format,extension) table for YouTube
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
# HTTP headers sent with every request, imitating a desktop Firefox browser
# so that the sites serve the same pages they would to a regular user.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simplified" titles (used for file names);
# everything outside this set gets collapsed to underscores by the extractors.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when a download problem occurs and
	the downloader has not been configured to continue on errors. The
	instance carries the appropriate error message.
	"""
	pass
38
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that several
	videos would have to be written to the same file on disk.
	"""
	pass
46
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
54
55 class FileDownloader(object):
56         """File Downloader class.
57
58         File downloader objects are the ones responsible of downloading the
59         actual video file and writing it to disk if the user has requested
60         it, among some other tasks. In most cases there should be one per
61         program. As, given a video URL, the downloader doesn't know how to
62         extract all the needed information, task that InfoExtractors do, it
63         has to pass the URL to one of them.
64
65         For this, file downloader objects have a method that allows
66         InfoExtractors to be registered in a given order. When it is passed
67         a URL, the file downloader handles it to the first InfoExtractor it
68         finds that reports being able to handle it. The InfoExtractor extracts
69         all the information about the video or videos the URL refers to, and
70         asks the FileDownloader to process the video information, possibly
71         downloading the video.
72
73         File downloaders accept a lot of parameters. In order not to saturate
74         the object constructor with arguments, it receives a dictionary of
75         options instead. These options are available through the params
76         attribute for the InfoExtractors to use. The FileDownloader also
77         registers itself as the downloader in charge for the InfoExtractors
78         that are added to it, so this is a "mutual registration".
79
80         Available options:
81
82         username:       Username for authentication purposes.
83         password:       Password for authentication purposes.
84         usenetrc:       Use netrc for authentication instead.
85         quiet:          Do not print messages to stdout.
86         forceurl:       Force printing final URL.
87         forcetitle:     Force printing title.
88         simulate:       Do not download the video files.
89         format:         Video format code.
90         outtmpl:        Template for output names.
91         ignoreerrors:   Do not stop on download errors.
92         ratelimit:      Download speed limit, in bytes/sec.
93         nooverwrites:   Prevent overwriting files.
94         """
95
96         params = None
97         _ies = []
98         _pps = []
99         _download_retcode = None
100
101         def __init__(self, params):
102                 """Create a FileDownloader object with the given options."""
103                 self._ies = []
104                 self._pps = []
105                 self._download_retcode = 0
106                 self.params = params
107         
108         @staticmethod
109         def pmkdir(filename):
110                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
111                 components = filename.split(os.sep)
112                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
113                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
114                 for dir in aggregate:
115                         if not os.path.exists(dir):
116                                 os.mkdir(dir)
117         
118         @staticmethod
119         def format_bytes(bytes):
120                 if bytes is None:
121                         return 'N/A'
122                 if bytes == 0:
123                         exponent = 0
124                 else:
125                         exponent = long(math.log(float(bytes), 1024.0))
126                 suffix = 'bkMGTPEZY'[exponent]
127                 converted = float(bytes) / float(1024**exponent)
128                 return '%.2f%s' % (converted, suffix)
129
130         @staticmethod
131         def calc_percent(byte_counter, data_len):
132                 if data_len is None:
133                         return '---.-%'
134                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
135
136         @staticmethod
137         def calc_eta(start, now, total, current):
138                 if total is None:
139                         return '--:--'
140                 dif = now - start
141                 if current == 0 or dif < 0.001: # One millisecond
142                         return '--:--'
143                 rate = float(current) / dif
144                 eta = long((float(total) - float(current)) / rate)
145                 (eta_mins, eta_secs) = divmod(eta, 60)
146                 if eta_mins > 99:
147                         return '--:--'
148                 return '%02d:%02d' % (eta_mins, eta_secs)
149
150         @staticmethod
151         def calc_speed(start, now, bytes):
152                 dif = now - start
153                 if bytes == 0 or dif < 0.001: # One millisecond
154                         return '%10s' % '---b/s'
155                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
156
157         @staticmethod
158         def best_block_size(elapsed_time, bytes):
159                 new_min = max(bytes / 2.0, 1.0)
160                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
161                 if elapsed_time < 0.001:
162                         return int(new_max)
163                 rate = bytes / elapsed_time
164                 if rate > new_max:
165                         return int(new_max)
166                 if rate < new_min:
167                         return int(new_min)
168                 return int(rate)
169
170         @staticmethod
171         def parse_bytes(bytestr):
172                 """Parse a string indicating a byte quantity into a long integer."""
173                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
174                 if matchobj is None:
175                         return None
176                 number = float(matchobj.group(1))
177                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
178                 return long(round(number * multiplier))
179
180         def add_info_extractor(self, ie):
181                 """Add an InfoExtractor object to the end of the list."""
182                 self._ies.append(ie)
183                 ie.set_downloader(self)
184         
185         def add_post_processor(self, pp):
186                 """Add a PostProcessor object to the end of the chain."""
187                 self._pps.append(pp)
188                 pp.set_downloader(self)
189         
190         def to_stdout(self, message, skip_eol=False):
191                 """Print message to stdout if not in quiet mode."""
192                 if not self.params.get('quiet', False):
193                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
194                         sys.stdout.flush()
195         
196         def to_stderr(self, message):
197                 """Print message to stderr."""
198                 print >>sys.stderr, message
199         
200         def fixed_template(self):
201                 """Checks if the output template is fixed."""
202                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
203
204         def trouble(self, message=None):
205                 """Determine action to take when a download problem appears.
206
207                 Depending on if the downloader has been configured to ignore
208                 download errors or not, this method may throw an exception or
209                 not when errors are found, after printing the message.
210                 """
211                 if message is not None:
212                         self.to_stderr(message)
213                 if not self.params.get('ignoreerrors', False):
214                         raise DownloadError(message)
215                 self._download_retcode = 1
216
217         def slow_down(self, start_time, byte_counter):
218                 """Sleep if the download speed is over the rate limit."""
219                 rate_limit = self.params.get('ratelimit', None)
220                 if rate_limit is None or byte_counter == 0:
221                         return
222                 now = time.time()
223                 elapsed = now - start_time
224                 if elapsed <= 0.0:
225                         return
226                 speed = float(byte_counter) / elapsed
227                 if speed > rate_limit:
228                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
229
230         def report_destination(self, filename):
231                 """Report destination filename."""
232                 self.to_stdout(u'[download] Destination: %s' % filename)
233         
234         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
235                 """Report download progress."""
236                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
237                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
238         
239         def report_finish(self):
240                 """Report download finished."""
241                 self.to_stdout(u'')
242
243         def process_info(self, info_dict):
244                 """Process a single dictionary returned by an InfoExtractor."""
245                 # Forced printings
246                 if self.params.get('forcetitle', False):
247                         print info_dict['title'].encode(locale.getpreferredencoding())
248                 if self.params.get('forceurl', False):
249                         print info_dict['url'].encode(locale.getpreferredencoding())
250                         
251                 # Do nothing else if in simulate mode
252                 if self.params.get('simulate', False):
253                         return
254
255                 try:
256                         filename = self.params['outtmpl'] % info_dict
257                         self.report_destination(filename)
258                 except (ValueError, KeyError), err:
259                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
260                 if self.params['nooverwrites'] and os.path.exists(filename):
261                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
262                         return
263                 try:
264                         self.pmkdir(filename)
265                 except (OSError, IOError), err:
266                         self.trouble('ERROR: unable to create directories: %s' % str(err))
267                         return
268                 try:
269                         outstream = open(filename, 'wb')
270                 except (OSError, IOError), err:
271                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
272                         return
273                 try:
274                         self._do_download(outstream, info_dict['url'])
275                         outstream.close()
276                 except (OSError, IOError), err:
277                         self.trouble('ERROR: unable to write video data: %s' % str(err))
278                         return
279                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
280                         self.trouble('ERROR: unable to download video data: %s' % str(err))
281                         return
282                 try:
283                         self.post_process(filename, info_dict)
284                 except (PostProcessingError), err:
285                         self.trouble('ERROR: postprocessing: %s' % str(err))
286                         return
287
288                 return
289
290         def download(self, url_list):
291                 """Download a given list of URLs."""
292                 if len(url_list) > 1 and self.fixed_template():
293                         raise SameFileError(self.params['outtmpl'])
294
295                 for url in url_list:
296                         suitable_found = False
297                         for ie in self._ies:
298                                 # Go to next InfoExtractor if not suitable
299                                 if not ie.suitable(url):
300                                         continue
301
302                                 # Suitable InfoExtractor found
303                                 suitable_found = True
304
305                                 # Extract information from URL and process it
306                                 ie.extract(url)
307
308                                 # Suitable InfoExtractor had been found; go to next URL
309                                 break
310
311                         if not suitable_found:
312                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
313
314                 return self._download_retcode
315
316         def post_process(self, filename, ie_info):
317                 """Run the postprocessing chain on the given file."""
318                 info = dict(ie_info)
319                 info['filepath'] = filename
320                 for pp in self._pps:
321                         info = pp.run(info)
322                         if info is None:
323                                 break
324         
325         def _do_download(self, stream, url):
326                 request = urllib2.Request(url, None, std_headers)
327                 data = urllib2.urlopen(request)
328                 data_len = data.info().get('Content-length', None)
329                 data_len_str = self.format_bytes(data_len)
330                 byte_counter = 0
331                 block_size = 1024
332                 start = time.time()
333                 while True:
334                         # Progress message
335                         percent_str = self.calc_percent(byte_counter, data_len)
336                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
337                         speed_str = self.calc_speed(start, time.time(), byte_counter)
338                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
339
340                         # Download and write
341                         before = time.time()
342                         data_block = data.read(block_size)
343                         after = time.time()
344                         data_block_len = len(data_block)
345                         if data_block_len == 0:
346                                 break
347                         byte_counter += data_block_len
348                         stream.write(data_block)
349                         block_size = self.best_block_size(after - before, data_block_len)
350
351                         # Apply rate limit
352                         self.slow_down(start, byte_counter)
353
354                 self.report_finish()
355                 if data_len is not None and str(byte_counter) != data_len:
356                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
357
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor retrieves the data describing
	the video (or videos) that URL refers to: the real video URL, the
	literal and simplified titles, the uploader nickname and so on. The
	resulting dictionary is handed to the FileDownloader, which may then
	download the video to disk among other outcomes. Dictionaries must
	include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.

	Subclasses should redefine _real_initialize() and _real_extract(),
	as well as the suitable() static method. They are normally
	instantiated and registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance
	_ready = False
	# FileDownloader in charge of this extractor (may be None)
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc). Runs only once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
418
419 class YoutubeIE(InfoExtractor):
420         """Information extractor for youtube.com."""
421
422         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
423         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
424         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
425         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
426         _NETRC_MACHINE = 'youtube'
427
428         @staticmethod
429         def suitable(url):
430                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
431
432         @staticmethod
433         def htmlentity_transform(matchobj):
434                 """Transforms an HTML entity to a Unicode character."""
435                 entity = matchobj.group(1)
436
437                 # Known non-numeric HTML entity
438                 if entity in htmlentitydefs.name2codepoint:
439                         return unichr(htmlentitydefs.name2codepoint[entity])
440
441                 # Unicode character
442                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
443                 if mobj is not None:
444                         numstr = mobj.group(1)
445                         if numstr.startswith(u'x'):
446                                 base = 16
447                                 numstr = u'0%s' % numstr
448                         else:
449                                 base = 10
450                         return unichr(long(numstr, base))
451
452                 # Unknown entity in name, return its literal representation
453                 return (u'&%s;' % entity)
454
455         def report_lang(self):
456                 """Report attempt to set language."""
457                 self._downloader.to_stdout(u'[youtube] Setting language')
458
459         def report_login(self):
460                 """Report attempt to log in."""
461                 self._downloader.to_stdout(u'[youtube] Logging in')
462         
463         def report_age_confirmation(self):
464                 """Report attempt to confirm age."""
465                 self._downloader.to_stdout(u'[youtube] Confirming age')
466         
467         def report_webpage_download(self, video_id):
468                 """Report attempt to download webpage."""
469                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
470         
471         def report_information_extraction(self, video_id):
472                 """Report attempt to extract video information."""
473                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
474         
475         def report_video_url(self, video_id, video_real_url):
476                 """Report extracted video URL."""
477                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
478         
479         def _real_initialize(self):
480                 if self._downloader is None:
481                         return
482
483                 username = None
484                 password = None
485                 downloader_params = self._downloader.params
486
487                 # Attempt to use provided username and password or .netrc data
488                 if downloader_params.get('username', None) is not None:
489                         username = downloader_params['username']
490                         password = downloader_params['password']
491                 elif downloader_params.get('usenetrc', False):
492                         try:
493                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
494                                 if info is not None:
495                                         username = info[0]
496                                         password = info[2]
497                                 else:
498                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
499                         except (IOError, netrc.NetrcParseError), err:
500                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
501                                 return
502
503                 # Set language
504                 request = urllib2.Request(self._LANG_URL, None, std_headers)
505                 try:
506                         self.report_lang()
507                         urllib2.urlopen(request).read()
508                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
509                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
510                         return
511
512                 # No authentication to be performed
513                 if username is None:
514                         return
515
516                 # Log in
517                 login_form = {
518                                 'current_form': 'loginForm',
519                                 'next':         '/',
520                                 'action_login': 'Log In',
521                                 'username':     username,
522                                 'password':     password,
523                                 }
524                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
525                 try:
526                         self.report_login()
527                         login_results = urllib2.urlopen(request).read()
528                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
529                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
530                                 return
531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
532                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
533                         return
534         
535                 # Confirm age
536                 age_form = {
537                                 'next_url':             '/',
538                                 'action_confirm':       'Confirm',
539                                 }
540                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
541                 try:
542                         self.report_age_confirmation()
543                         age_results = urllib2.urlopen(request).read()
544                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
545                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
546                         return
547
548         def _real_extract(self, url):
549                 # Extract video id from URL
550                 mobj = re.match(self._VALID_URL, url)
551                 if mobj is None:
552                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
553                         return
554                 video_id = mobj.group(2)
555
556                 # Downloader parameters
557                 format_param = None
558                 if self._downloader is not None:
559                         params = self._downloader.params
560                         format_param = params.get('format', None)
561
562                 # Extension
563                 video_extension = {
564                         '13': '3gp',
565                         '17': 'mp4',
566                         '18': 'mp4',
567                         '22': 'mp4',
568                 }.get(format_param, 'flv')
569
570                 # Normalize URL, including format
571                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
572                 if format_param is not None:
573                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
574                 request = urllib2.Request(normalized_url, None, std_headers)
575                 try:
576                         self.report_webpage_download(video_id)
577                         video_webpage = urllib2.urlopen(request).read()
578                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
579                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
580                         return
581                 self.report_information_extraction(video_id)
582                 
583                 # "t" param
584                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
585                 if mobj is None:
586                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
587                         return
588                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
589                 if format_param is not None:
590                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
591                 self.report_video_url(video_id, video_real_url)
592
593                 # uploader
594                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
595                 if mobj is None:
596                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
597                         return
598                 video_uploader = mobj.group(1)
599
600                 # title
601                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
602                 if mobj is None:
603                         self._downloader.trouble(u'ERROR: unable to extract video title')
604                         return
605                 video_title = mobj.group(1).decode('utf-8')
606                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
607                 video_title = video_title.replace(os.sep, u'%')
608
609                 # simplified title
610                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
611                 simple_title = simple_title.strip(ur'_')
612
613                 # Process video information
614                 self._downloader.process_info({
615                         'id':           video_id.decode('utf-8'),
616                         'url':          video_real_url.decode('utf-8'),
617                         'uploader':     video_uploader.decode('utf-8'),
618                         'title':        video_title,
619                         'stitle':       simple_title,
620                         'ext':          video_extension.decode('utf-8'),
621                         })
622
623 class MetacafeIE(InfoExtractor):
624         """Information Extractor for metacafe.com."""
625
626         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
627         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
628         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
629         _youtube_ie = None
630
631         def __init__(self, youtube_ie, downloader=None):
632                 InfoExtractor.__init__(self, downloader)
633                 self._youtube_ie = youtube_ie
634
635         @staticmethod
636         def suitable(url):
637                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
638
639         def report_disclaimer(self):
640                 """Report disclaimer retrieval."""
641                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
642
643         def report_age_confirmation(self):
644                 """Report attempt to confirm age."""
645                 self._downloader.to_stdout(u'[metacafe] Confirming age')
646         
647         def report_download_webpage(self, video_id):
648                 """Report webpage download."""
649                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
650         
651         def report_extraction(self, video_id):
652                 """Report information extraction."""
653                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
654
655         def _real_initialize(self):
656                 # Retrieve disclaimer
657                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
658                 try:
659                         self.report_disclaimer()
660                         disclaimer = urllib2.urlopen(request).read()
661                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
662                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
663                         return
664
665                 # Confirm age
666                 disclaimer_form = {
667                         'filters': '0',
668                         'submit': "Continue - I'm over 18",
669                         }
670                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
671                 try:
672                         self.report_age_confirmation()
673                         disclaimer = urllib2.urlopen(request).read()
674                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
675                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
676                         return
677         
678         def _real_extract(self, url):
679                 # Extract id and simplified title from URL
680                 mobj = re.match(self._VALID_URL, url)
681                 if mobj is None:
682                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
683                         return
684
685                 video_id = mobj.group(1)
686
687                 # Check if video comes from YouTube
688                 mobj2 = re.match(r'^yt-(.*)$', video_id)
689                 if mobj2 is not None:
690                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
691                         return
692
693                 simple_title = mobj.group(2).decode('utf-8')
694                 video_extension = 'flv'
695
696                 # Retrieve video webpage to extract further information
697                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
698                 try:
699                         self.report_download_webpage(video_id)
700                         webpage = urllib2.urlopen(request).read()
701                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
702                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
703                         return
704
705                 # Extract URL, uploader and title from webpage
706                 self.report_extraction(video_id)
707                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
708                 if mobj is None:
709                         self._downloader.trouble(u'ERROR: unable to extract media URL')
710                         return
711                 mediaURL = urllib.unquote(mobj.group(1))
712
713                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
714                 if mobj is None:
715                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
716                         return
717                 gdaKey = mobj.group(1)
718
719                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
720
721                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
722                 if mobj is None:
723                         self._downloader.trouble(u'ERROR: unable to extract title')
724                         return
725                 video_title = mobj.group(1).decode('utf-8')
726
727                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
728                 if mobj is None:
729                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
730                         return
731                 video_uploader = mobj.group(1)
732
733                 # Process video information
734                 self._downloader.process_info({
735                         'id':           video_id.decode('utf-8'),
736                         'url':          video_url.decode('utf-8'),
737                         'uploader':     video_uploader.decode('utf-8'),
738                         'title':        video_title,
739                         'stitle':       simple_title,
740                         'ext':          video_extension.decode('utf-8'),
741                         })
742
743
744 class YoutubeSearchIE(InfoExtractor):
745         """Information Extractor for YouTube search queries."""
746         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
747         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
748         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
749         _MORE_PAGES_INDICATOR = r'>Next</a>'
750         _youtube_ie = None
751         _max_youtube_results = 1000
752
753         def __init__(self, youtube_ie, downloader=None):
754                 InfoExtractor.__init__(self, downloader)
755                 self._youtube_ie = youtube_ie
756         
757         @staticmethod
758         def suitable(url):
759                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
760
761         def report_download_page(self, query, pagenum):
762                 """Report attempt to download playlist page with given number."""
763                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
764
765         def _real_initialize(self):
766                 self._youtube_ie.initialize()
767         
768         def _real_extract(self, query):
769                 mobj = re.match(self._VALID_QUERY, query)
770                 if mobj is None:
771                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
772                         return
773
774                 prefix, query = query.split(':')
775                 prefix = prefix[8:]
776                 if prefix == '':
777                         self._download_n_results(query, 1)
778                         return
779                 elif prefix == 'all':
780                         self._download_n_results(query, self._max_youtube_results)
781                         return
782                 else:
783                         try:
784                                 n = int(prefix)
785                                 if n <= 0:
786                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
787                                         return
788                                 elif n > self._max_youtube_results:
789                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
790                                         n = self._max_youtube_results
791                                 self._download_n_results(query, n)
792                                 return
793                         except ValueError: # parsing prefix as int fails
794                                 self._download_n_results(query, 1)
795                                 return
796
797         def _download_n_results(self, query, n):
798                 """Downloads a specified number of results for a query"""
799
800                 video_ids = []
801                 already_seen = set()
802                 pagenum = 1
803
804                 while True:
805                         self.report_download_page(query, pagenum)
806                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
807                         request = urllib2.Request(result_url, None, std_headers)
808                         try:
809                                 page = urllib2.urlopen(request).read()
810                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
811                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
812                                 return
813
814                         # Extract video identifiers
815                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
816                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
817                                 if video_id not in already_seen:
818                                         video_ids.append(video_id)
819                                         already_seen.add(video_id)
820                                         if len(video_ids) == n:
821                                                 # Specified n videos reached
822                                                 for id in video_ids:
823                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
824                                                 return
825
826                         if self._MORE_PAGES_INDICATOR not in page:
827                                 for id in video_ids:
828                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
829                                 return
830
831                         pagenum = pagenum + 1
832
833 class YoutubePlaylistIE(InfoExtractor):
834         """Information Extractor for YouTube playlists."""
835
836         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
837         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
838         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
839         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
840         _youtube_ie = None
841
842         def __init__(self, youtube_ie, downloader=None):
843                 InfoExtractor.__init__(self, downloader)
844                 self._youtube_ie = youtube_ie
845         
846         @staticmethod
847         def suitable(url):
848                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
849
850         def report_download_page(self, playlist_id, pagenum):
851                 """Report attempt to download playlist page with given number."""
852                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
853
854         def _real_initialize(self):
855                 self._youtube_ie.initialize()
856         
857         def _real_extract(self, url):
858                 # Extract playlist id
859                 mobj = re.match(self._VALID_URL, url)
860                 if mobj is None:
861                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
862                         return
863
864                 # Download playlist pages
865                 playlist_id = mobj.group(1)
866                 video_ids = []
867                 pagenum = 1
868
869                 while True:
870                         self.report_download_page(playlist_id, pagenum)
871                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
872                         try:
873                                 page = urllib2.urlopen(request).read()
874                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
875                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
876                                 return
877
878                         # Extract video identifiers
879                         ids_in_page = []
880                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
881                                 if mobj.group(1) not in ids_in_page:
882                                         ids_in_page.append(mobj.group(1))
883                         video_ids.extend(ids_in_page)
884
885                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
886                                 break
887                         pagenum = pagenum + 1
888
889                 for id in video_ids:
890                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
891                 return
892
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects are registered on a downloader through its
	add_post_processor() method. Once a download finishes successfully,
	the downloader walks its internal chain of PostProcessors, calling
	run() on each one: the first receives an initial argument and every
	subsequent one receives the value returned by its predecessor.

	The chain stops as soon as a processor returns None, or when its
	end is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors, with one extra field: "filepath",
		the location of the downloaded file.

		Returning None stops the postprocessing chain. Returning an
		information dictionary (possibly the received one with some
		fields changed) passes it along to the next postprocessing
		object in the chain.

		This method may also raise a PostProcessingError exception,
		which the calling downloader takes into account.
		"""
		return information # by default, do nothing
938         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# General configuration
		# NOTE(review): the second install_opener() replaces the first;
		# build_opener() adds a ProxyHandler by default so proxy support
		# is preserved anyway -- confirm this double install is intended.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
				usage='Usage: %prog [options] url...',
				version='INTERNAL',
				conflict_handler='resolve',
				)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		# -m and -d are plain aliases that store a fixed format code
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		parser.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		(opts, args) = parser.parse_args()

		# Batch file verification
		# URLs from the batch file (one per line, blanks skipped) are
		# processed before those given on the command line.
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert e.g. "50k" to a numeric bytes-per-second value
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader
		# -g/-e imply both quiet and simulate; the output template falls
		# back from explicit -o, to -t/-l forms, to plain "id.ext".
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			})
		# More specific extractors first: search and playlist URLs would
		# otherwise be matched by the plain YouTube extractor.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')