git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # Author: Danny Colligan
   5 # License: Public domain code
   6 import htmlentitydefs
   7 import httplib
   8 import locale
   9 import math
  10 import netrc
  11 import os
  12 import os.path
  13 import re
  14 import socket
  15 import string
  16 import sys
  17 import time
  18 import urllib
  19 import urllib2
  20
  21 std_headers = {
  22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
  23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  25         'Accept-Language': 'en-us,en;q=0.5',
  26 }
  27
  28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  29
  30 class DownloadError(Exception):
  31         """Download Error exception.
  32
  33         This exception may be thrown by FileDownloader objects if they are not
  34         configured to continue on errors. They will contain the appropriate
  35         error message.
  36         """
  37         pass
  38
  39 class SameFileError(Exception):
  40         """Same File exception.
  41
  42         This exception will be thrown by FileDownloader objects if they detect
  43         multiple files would have to be downloaded to the same file on disk.
  44         """
  45         pass
  46
  47 class PostProcessingError(Exception):
  48         """Post Processing exception.
  49
  50         This exception may be raised by PostProcessor's .run() method to
  51         indicate an error in the postprocessing task.
  52         """
  53         pass
  54
  55 class FileDownloader(object):
  56         """File Downloader class.
  57
  58         File downloader objects are the ones responsible of downloading the
  59         actual video file and writing it to disk if the user has requested
  60         it, among some other tasks. In most cases there should be one per
  61         program. As, given a video URL, the downloader doesn't know how to
  62         extract all the needed information, task that InfoExtractors do, it
  63         has to pass the URL to one of them.
  64
  65         For this, file downloader objects have a method that allows
  66         InfoExtractors to be registered in a given order. When it is passed
  67         a URL, the file downloader handles it to the first InfoExtractor it
  68         finds that reports being able to handle it. The InfoExtractor returns
  69         all the information to the FileDownloader and the latter downloads the
  70         file or does whatever it's instructed to do.
  71
  72         File downloaders accept a lot of parameters. In order not to saturate
  73         the object constructor with arguments, it receives a dictionary of
  74         options instead. These options are available through the params
  75         attribute for the InfoExtractors to use. The FileDownloader also
  76         registers itself as the downloader in charge for the InfoExtractors
  77         that are added to it, so this is a "mutual registration".
  78
  79         Available options:
  80
  81         username:       Username for authentication purposes.
  82         password:       Password for authentication purposes.
  83         usenetrc:       Use netrc for authentication instead.
  84         quiet:          Do not print messages to stdout.
  85         forceurl:       Force printing final URL.
  86         forcetitle:     Force printing title.
  87         simulate:       Do not download the video files.
  88         format:         Video format code.
  89         outtmpl:        Template for output names.
  90         ignoreerrors:   Do not stop on download errors.
  91         ratelimit:      Download speed limit, in bytes/sec.
  92         nooverwrites:   Prevent overwriting files.
  93         """
  94
  95         params = None
  96         _ies = []
  97         _pps = []
  98
  99         def __init__(self, params):
 100                 """Create a FileDownloader object with the given options."""
 101                 self._ies = []
 102                 self._pps = []
 103                 self.params = params
 104
 105         @staticmethod
 106         def pmkdir(filename):
 107                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
 108                 components = filename.split(os.sep)
 109                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
 110                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
 111                 for dir in aggregate:
 112                         if not os.path.exists(dir):
 113                                 os.mkdir(dir)
 114
 115         @staticmethod
 116         def format_bytes(bytes):
 117                 if bytes is None:
 118                         return 'N/A'
 119                 if bytes == 0:
 120                         exponent = 0
 121                 else:
 122                         exponent = long(math.log(float(bytes), 1024.0))
 123                 suffix = 'bkMGTPEZY'[exponent]
 124                 converted = float(bytes) / float(1024**exponent)
 125                 return '%.2f%s' % (converted, suffix)
 126
 127         @staticmethod
 128         def calc_percent(byte_counter, data_len):
 129                 if data_len is None:
 130                         return '---.-%'
 131                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 132
 133         @staticmethod
 134         def calc_eta(start, now, total, current):
 135                 if total is None:
 136                         return '--:--'
 137                 dif = now - start
 138                 if current == 0 or dif < 0.001: # One millisecond
 139                         return '--:--'
 140                 rate = float(current) / dif
 141                 eta = long((float(total) - float(current)) / rate)
 142                 (eta_mins, eta_secs) = divmod(eta, 60)
 143                 if eta_mins > 99:
 144                         return '--:--'
 145                 return '%02d:%02d' % (eta_mins, eta_secs)
 146
 147         @staticmethod
 148         def calc_speed(start, now, bytes):
 149                 dif = now - start
 150                 if bytes == 0 or dif < 0.001: # One millisecond
 151                         return '%10s' % '---b/s'
 152                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 153
 154         @staticmethod
 155         def best_block_size(elapsed_time, bytes):
 156                 new_min = max(bytes / 2.0, 1.0)
 157                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 158                 if elapsed_time < 0.001:
 159                         return int(new_max)
 160                 rate = bytes / elapsed_time
 161                 if rate > new_max:
 162                         return int(new_max)
 163                 if rate < new_min:
 164                         return int(new_min)
 165                 return int(rate)
 166
 167         @staticmethod
 168         def parse_bytes(bytestr):
 169                 """Parse a string indicating a byte quantity into a long integer."""
 170                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
 171                 if matchobj is None:
 172                         return None
 173                 number = float(matchobj.group(1))
 174                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
 175                 return long(round(number * multiplier))
 176
 177         def add_info_extractor(self, ie):
 178                 """Add an InfoExtractor object to the end of the list."""
 179                 self._ies.append(ie)
 180                 ie.set_downloader(self)
 181
 182         def add_post_processor(self, pp):
 183                 """Add a PostProcessor object to the end of the chain."""
 184                 self._pps.append(pp)
 185                 pp.set_downloader(self)
 186
 187         def to_stdout(self, message, skip_eol=False):
 188                 """Print message to stdout if not in quiet mode."""
 189                 if not self.params.get('quiet', False):
 190                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
 191                         sys.stdout.flush()
 192
 193         def to_stderr(self, message):
 194                 """Print message to stderr."""
 195                 print >>sys.stderr, message
 196
 197         def fixed_template(self):
 198                 """Checks if the output template is fixed."""
 199                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
 200
 201         def trouble(self, message=None):
 202                 """Determine action to take when a download problem appears.
 203
 204                 Depending on if the downloader has been configured to ignore
 205                 download errors or not, this method may throw an exception or
 206                 not when errors are found, after printing the message. If it
 207                 doesn't raise, it returns an error code suitable to be returned
 208                 later as a program exit code to indicate error.
 209                 """
 210                 if message is not None:
 211                         self.to_stderr(message)
 212                 if not self.params.get('ignoreerrors', False):
 213                         raise DownloadError(message)
 214                 return 1
 215
 216         def slow_down(self, start_time, byte_counter):
 217                 """Sleep if the download speed is over the rate limit."""
 218                 rate_limit = self.params.get('ratelimit', None)
 219                 if rate_limit is None or byte_counter == 0:
 220                         return
 221                 now = time.time()
 222                 elapsed = now - start_time
 223                 if elapsed <= 0.0:
 224                         return
 225                 speed = float(byte_counter) / elapsed
 226                 if speed > rate_limit:
 227                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 228
 229         def report_destination(self, filename):
 230                 """Report destination filename."""
 231                 self.to_stdout(u'[download] Destination: %s' % filename)
 232
 233         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
 234                 """Report download progress."""
 235                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
 236                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 237
 238         def report_finish(self):
 239                 """Report download finished."""
 240                 self.to_stdout(u'')
 241
 242         def download(self, url_list):
 243                 """Download a given list of URLs."""
 244                 retcode = 0
 245                 if len(url_list) > 1 and self.fixed_template():
 246                         raise SameFileError(self.params['outtmpl'])
 247
 248                 for url in url_list:
 249                         suitable_found = False
 250                         for ie in self._ies:
 251                                 if not ie.suitable(url):
 252                                         continue
 253                                 # Suitable InfoExtractor found
 254                                 suitable_found = True
 255                                 all_results = ie.extract(url)
 256                                 results = [x for x in all_results if x is not None]
 257                                 if len(results) != len(all_results):
 258                                         retcode = self.trouble()
 259
 260                                 if len(results) > 1 and self.fixed_template():
 261                                         raise SameFileError(self.params['outtmpl'])
 262
 263                                 for result in results:
 264                                         # Forced printings
 265                                         if self.params.get('forcetitle', False):
 266                                                 print result['title']
 267                                         if self.params.get('forceurl', False):
 268                                                 print result['url']
 269
 270                                         # Do nothing else if in simulate mode
 271                                         if self.params.get('simulate', False):
 272                                                 continue
 273
 274                                         try:
 275                                                 filename = self.params['outtmpl'] % result
 276                                                 self.report_destination(filename)
 277                                         except (ValueError, KeyError), err:
 278                                                 retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
 279                                                 continue
 280                                         if self.params['nooverwrites'] and os.path.exists(filename):
 281                                                 self.to_stderr('WARNING: file exists: %s; skipping' % filename)
 282                                                 continue
 283                                         try:
 284                                                 self.pmkdir(filename)
 285                                         except (OSError, IOError), err:
 286                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 287                                                 continue
 288                                         try:
 289                                                 outstream = open(filename, 'wb')
 290                                         except (OSError, IOError), err:
 291                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 292                                                 continue
 293                                         try:
 294                                                 self._do_download(outstream, result['url'])
 295                                                 outstream.close()
 296                                         except (OSError, IOError), err:
 297                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 298                                                 continue
 299                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 300                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 301                                                 continue
 302                                         try:
 303                                                 self.post_process(filename, result)
 304                                         except (PostProcessingError), err:
 305                                                 retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
 306                                                 continue
 307
 308                                 break
 309                         if not suitable_found:
 310                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 311
 312                 return retcode
 313
 314         def post_process(self, filename, ie_info):
 315                 """Run the postprocessing chain on the given file."""
 316                 info = dict(ie_info)
 317                 info['filepath'] = filename
 318                 for pp in self._pps:
 319                         info = pp.run(info)
 320                         if info is None:
 321                                 break
 322
 323         def _do_download(self, stream, url):
 324                 request = urllib2.Request(url, None, std_headers)
 325                 data = urllib2.urlopen(request)
 326                 data_len = data.info().get('Content-length', None)
 327                 data_len_str = self.format_bytes(data_len)
 328                 byte_counter = 0
 329                 block_size = 1024
 330                 start = time.time()
 331                 while True:
 332                         # Progress message
 333                         percent_str = self.calc_percent(byte_counter, data_len)
 334                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 335                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 336                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
 337
 338                         # Download and write
 339                         before = time.time()
 340                         data_block = data.read(block_size)
 341                         after = time.time()
 342                         data_block_len = len(data_block)
 343                         if data_block_len == 0:
 344                                 break
 345                         byte_counter += data_block_len
 346                         stream.write(data_block)
 347                         block_size = self.best_block_size(after - before, data_block_len)
 348
 349                         # Apply rate limit
 350                         self.slow_down(start, byte_counter)
 351
 352                 self.report_finish()
 353                 if data_len is not None and str(byte_counter) != data_len:
 354                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 355
 356 class InfoExtractor(object):
 357         """Information Extractor class.
 358
 359         Information extractors are the classes that, given a URL, extract
 360         information from the video (or videos) the URL refers to. This
 361         information includes the real video URL, the video title and simplified
 362         title, author and others. It is returned in a list of dictionaries when
 363         calling its extract() method. It is a list because a URL can refer to
 364         more than one video (think of playlists). The dictionaries must include
 365         the following fields:
 366
 367         id:             Video identifier.
 368         url:            Final video URL.
 369         uploader:       Nickname of the video uploader.
 370         title:          Literal title.
 371         stitle:         Simplified title.
 372         ext:            Video filename extension.
 373
 374         Subclasses of this one should re-define the _real_initialize() and
 375         _real_extract() methods, as well as the suitable() static method.
 376         Probably, they should also be instantiated and added to the main
 377         downloader.
 378         """
 379
 380         _ready = False
 381         _downloader = None
 382
 383         def __init__(self, downloader=None):
 384                 """Constructor. Receives an optional downloader."""
 385                 self._ready = False
 386                 self.set_downloader(downloader)
 387
 388         @staticmethod
 389         def suitable(url):
 390                 """Receives a URL and returns True if suitable for this IE."""
 391                 return False
 392
 393         def initialize(self):
 394                 """Initializes an instance (authentication, etc)."""
 395                 if not self._ready:
 396                         self._real_initialize()
 397                         self._ready = True
 398
 399         def extract(self, url):
 400                 """Extracts URL information and returns it in list of dicts."""
 401                 self.initialize()
 402                 return self._real_extract(url)
 403
 404         def set_downloader(self, downloader):
 405                 """Sets the downloader for this IE."""
 406                 self._downloader = downloader
 407
 408         def to_stdout(self, message):
 409                 """Print message to stdout if downloader is not in quiet mode."""
 410                 if self._downloader is None or not self._downloader.params.get('quiet', False):
 411                         print message
 412
 413         def to_stderr(self, message):
 414                 """Print message to stderr."""
 415                 print >>sys.stderr, message
 416
 417         def _real_initialize(self):
 418                 """Real initialization process. Redefine in subclasses."""
 419                 pass
 420
 421         def _real_extract(self, url):
 422                 """Real extraction process. Redefine in subclasses."""
 423                 pass
 424
 425 class YoutubeIE(InfoExtractor):
 426         """Information extractor for youtube.com."""
 427
 428         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
 429         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 430         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
 431         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 432         _NETRC_MACHINE = 'youtube'
 433
 434         @staticmethod
 435         def suitable(url):
 436                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
 437
 438         def report_lang(self):
 439                 """Report attempt to set language."""
 440                 self.to_stdout(u'[youtube] Setting language')
 441
 442         def report_login(self):
 443                 """Report attempt to log in."""
 444                 self.to_stdout(u'[youtube] Logging in')
 445
 446         def report_age_confirmation(self):
 447                 """Report attempt to confirm age."""
 448                 self.to_stdout(u'[youtube] Confirming age')
 449
 450         def report_webpage_download(self, video_id):
 451                 """Report attempt to download webpage."""
 452                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
 453
 454         def report_information_extraction(self, video_id):
 455                 """Report attempt to extract video information."""
 456                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
 457
 458         def report_video_url(self, video_id, video_real_url):
 459                 """Report extracted video URL."""
 460                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
 461
 462         def _real_initialize(self):
 463                 if self._downloader is None:
 464                         return
 465
 466                 username = None
 467                 password = None
 468                 downloader_params = self._downloader.params
 469
 470                 # Attempt to use provided username and password or .netrc data
 471                 if downloader_params.get('username', None) is not None:
 472                         username = downloader_params['username']
 473                         password = downloader_params['password']
 474                 elif downloader_params.get('usenetrc', False):
 475                         try:
 476                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 477                                 if info is not None:
 478                                         username = info[0]
 479                                         password = info[2]
 480                                 else:
 481                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 482                         except (IOError, netrc.NetrcParseError), err:
 483                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
 484                                 return
 485
 486                 # Set language
 487                 request = urllib2.Request(self._LANG_URL, None, std_headers)
 488                 try:
 489                         self.report_lang()
 490                         urllib2.urlopen(request).read()
 491                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 492                         self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
 493                         return
 494
 495                 # No authentication to be performed
 496                 if username is None:
 497                         return
 498
 499                 # Log in
 500                 login_form = {
 501                                 'current_form': 'loginForm',
 502                                 'next':         '/',
 503                                 'action_login': 'Log In',
 504                                 'username':     username,
 505                                 'password':     password,
 506                                 }
 507                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 508                 try:
 509                         self.report_login()
 510                         login_results = urllib2.urlopen(request).read()
 511                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 512                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
 513                                 return
 514                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 515                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
 516                         return
 517
 518                 # Confirm age
 519                 age_form = {
 520                                 'next_url':             '/',
 521                                 'action_confirm':       'Confirm',
 522                                 }
 523                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 524                 try:
 525                         self.report_age_confirmation()
 526                         age_results = urllib2.urlopen(request).read()
 527                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 528                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 529                         return
 530
 531         def _real_extract(self, url):
 532                 # Extract video id from URL
 533                 mobj = re.match(self._VALID_URL, url)
 534                 if mobj is None:
 535                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 536                         return [None]
 537                 video_id = mobj.group(2)
 538
 539                 # Downloader parameters
 540                 format_param = None
 541                 if self._downloader is not None:
 542                         params = self._downloader.params
 543                         format_param = params.get('format', None)
 544
 545                 # Extension
 546                 video_extension = {
 547                         '17': '3gp',
 548                         '18': 'mp4',
 549                         '22': 'mp4',
 550                 }.get(format_param, 'flv')
 551
 552                 # Normalize URL, including format
 553                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
 554                 if format_param is not None:
 555                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 556                 request = urllib2.Request(normalized_url, None, std_headers)
 557                 try:
 558                         self.report_webpage_download(video_id)
 559                         video_webpage = urllib2.urlopen(request).read()
 560                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 561                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
 562                         return [None]
 563                 self.report_information_extraction(video_id)
 564
 565                 # "t" param
 566                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 567                 if mobj is None:
 568                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
 569                         return [None]
 570                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 571                 if format_param is not None:
 572                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 573                 self.report_video_url(video_id, video_real_url)
 574
 575                 # uploader
 576                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
 577                 if mobj is None:
 578                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 579                         return [None]
 580                 video_uploader = mobj.group(1)
 581
 582                 # title
 583                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 584                 if mobj is None:
 585                         self.to_stderr(u'ERROR: unable to extract video title')
 586                         return [None]
 587                 video_title = mobj.group(1).decode('utf-8')
 588                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 589                 video_title = video_title.replace(os.sep, u'%')
 590
 591                 # simplified title
 592                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 593                 simple_title = simple_title.strip(ur'_')
 594
 595                 # Return information
 596                 return [{
 597                         'id':           video_id.decode('utf-8'),
 598                         'url':          video_real_url.decode('utf-8'),
 599                         'uploader':     video_uploader.decode('utf-8'),
 600                         'title':        video_title,
 601                         'stitle':       simple_title,
 602                         'ext':          video_extension.decode('utf-8'),
 603                         }]
 604
 605 class MetacafeIE(InfoExtractor):
 606         """Information Extractor for metacafe.com."""
 607
 608         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 609         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 610         _youtube_ie = None
 611
 612         def __init__(self, youtube_ie, downloader=None):
 613                 InfoExtractor.__init__(self, downloader)
 614                 self._youtube_ie = youtube_ie
 615
 616         @staticmethod
 617         def suitable(url):
 618                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
 619
 620         def report_disclaimer(self):
 621                 """Report disclaimer retrieval."""
 622                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
 623
 624         def report_age_confirmation(self):
 625                 """Report attempt to confirm age."""
 626                 self.to_stdout(u'[metacafe] Confirming age')
 627
 628         def report_download_webpage(self, video_id):
 629                 """Report webpage download."""
 630                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
 631
 632         def report_extraction(self, video_id):
 633                 """Report information extraction."""
 634                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
 635
 636         def _real_initialize(self):
 637                 # Retrieve disclaimer
 638                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
 639                 try:
 640                         self.report_disclaimer()
 641                         disclaimer = urllib2.urlopen(request).read()
 642                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 643                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
 644                         return
 645
 646                 # Confirm age
 647                 disclaimer_form = {
 648                         'filters': '0',
 649                         'submit': "Continue - I'm over 18",
 650                         }
 651                 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
 652                 try:
 653                         self.report_age_confirmation()
 654                         disclaimer = urllib2.urlopen(request).read()
 655                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 656                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
 657                         return
 658
 659         def _real_extract(self, url):
 660                 # Extract id and simplified title from URL
 661                 mobj = re.match(self._VALID_URL, url)
 662                 if mobj is None:
 663                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
 664                         return [None]
 665
 666                 video_id = mobj.group(1)
 667
 668                 # Check if video comes from YouTube
 669                 mobj2 = re.match(r'^yt-(.*)$', video_id)
 670                 if mobj2 is not None:
 671                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
 672
 673                 simple_title = mobj.group(2).decode('utf-8')
 674                 video_extension = 'flv'
 675
 676                 # Retrieve video webpage to extract further information
 677                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
 678                 try:
 679                         self.report_download_webpage(video_id)
 680                         webpage = urllib2.urlopen(request).read()
 681                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 682                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
 683                         return [None]
 684
 685                 # Extract URL, uploader and title from webpage
 686                 self.report_extraction(video_id)
 687                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
 688                 if mobj is None:
 689                         self.to_stderr(u'ERROR: unable to extract media URL')
 690                         return [None]
 691                 mediaURL = mobj.group(1).replace('\\', '')
 692
 693                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
 694                 if mobj is None:
 695                         self.to_stderr(u'ERROR: unable to extract gdaKey')
 696                         return [None]
 697                 gdaKey = mobj.group(1)
 698
 699                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 700
 701                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 702                 if mobj is None:
 703                         self.to_stderr(u'ERROR: unable to extract title')
 704                         return [None]
 705                 video_title = mobj.group(1).decode('utf-8')
 706
 707                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
 708                 if mobj is None:
 709                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
 710                         return [None]
 711                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
 712
 713                 # Return information
 714                 return [{
 715                         'id':           video_id.decode('utf-8'),
 716                         'url':          video_url.decode('utf-8'),
 717                         'uploader':     video_uploader.decode('utf-8'),
 718                         'title':        video_title,
 719                         'stitle':       simple_title,
 720                         'ext':          video_extension.decode('utf-8'),
 721                         }]
 722
 723
 724 class YoutubeSearchIE(InfoExtractor):
 725         """Information Extractor for YouTube search queries."""
 726         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
 727         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
 728         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
 729         _MORE_PAGES_INDICATOR = r'>Next</a>'
 730         _youtube_ie = None
 731
 732         def __init__(self, youtube_ie, downloader=None):
 733                 InfoExtractor.__init__(self, downloader)
 734                 self._youtube_ie = youtube_ie
 735
 736         @staticmethod
 737         def suitable(url):
 738                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
 739
 740         def report_download_page(self, query, pagenum):
 741                 """Report attempt to download playlist page with given number."""
 742                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 743
 744         def _real_initialize(self):
 745                 self._youtube_ie.initialize()
 746
 747         def _real_extract(self, query):
 748                 mobj = re.match(self._VALID_QUERY, query)
 749                 if mobj is None:
 750                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
 751                         return [None]
 752
 753                 prefix, query = query.split(':')
 754                 prefix = prefix[8:]
 755                 if prefix == '':
 756                         return self._download_n_results(query, 1)
 757                 elif prefix == 'all':
 758                         return self._download_n_results(query, -1)
 759                 else:
 760                         try:
 761                                 n = int(prefix)
 762                                 if n <= 0:
 763                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
 764                                         return [None]
 765                                 return self._download_n_results(query, n)
 766                         except ValueError: # parsing prefix as int fails
 767                                 return self._download_n_results(query, 1)
 768
 769         def _download_n_results(self, query, n):
 770                 """Downloads a specified number of results for a query"""
 771
 772                 video_ids = []
 773                 already_seen = set()
 774                 pagenum = 1
 775
 776                 while True:
 777                         self.report_download_page(query, pagenum)
 778                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
 779                         request = urllib2.Request(result_url, None, std_headers)
 780                         try:
 781                                 page = urllib2.urlopen(request).read()
 782                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 783                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
 784                                 return [None]
 785
 786                         # Extract video identifiers
 787                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 788                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
 789                                 if video_id not in already_seen:
 790                                         video_ids.append(video_id)
 791                                         already_seen.add(video_id)
 792                                         if len(video_ids) == n:
 793                                                 # Specified n videos reached
 794                                                 information = []
 795                                                 for id in video_ids:
 796                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 797                                                 return information
 798
 799                         if self._MORE_PAGES_INDICATOR not in page:
 800                                 information = []
 801                                 for id in video_ids:
 802                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 803                                 return information
 804
 805                         pagenum = pagenum + 1
 806
 807 class YoutubePlaylistIE(InfoExtractor):
 808         """Information Extractor for YouTube playlists."""
 809
 810         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
 811         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
 812         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
 813         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
 814         _youtube_ie = None
 815
 816         def __init__(self, youtube_ie, downloader=None):
 817                 InfoExtractor.__init__(self, downloader)
 818                 self._youtube_ie = youtube_ie
 819
 820         @staticmethod
 821         def suitable(url):
 822                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
 823
 824         def report_download_page(self, playlist_id, pagenum):
 825                 """Report attempt to download playlist page with given number."""
 826                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
 827
 828         def _real_initialize(self):
 829                 self._youtube_ie.initialize()
 830
 831         def _real_extract(self, url):
 832                 # Extract playlist id
 833                 mobj = re.match(self._VALID_URL, url)
 834                 if mobj is None:
 835                         self.to_stderr(u'ERROR: invalid url: %s' % url)
 836                         return [None]
 837
 838                 # Download playlist pages
 839                 playlist_id = mobj.group(1)
 840                 video_ids = []
 841                 pagenum = 1
 842
 843                 while True:
 844                         self.report_download_page(playlist_id, pagenum)
 845                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
 846                         try:
 847                                 page = urllib2.urlopen(request).read()
 848                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 849                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
 850                                 return [None]
 851
 852                         # Extract video identifiers
 853                         ids_in_page = []
 854                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
 855                                 if mobj.group(1) not in ids_in_page:
 856                                         ids_in_page.append(mobj.group(1))
 857                         video_ids.extend(ids_in_page)
 858
 859                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
 860                                 break
 861                         pagenum = pagenum + 1
 862
 863                 information = []
 864                 for id in video_ids:
 865                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
 866                 return information
 867
 868 class PostProcessor(object):
 869         """Post Processor class.
 870
 871         PostProcessor objects can be added to downloaders with their
 872         add_post_processor() method. When the downloader has finished a
 873         successful download, it will take its internal chain of PostProcessors
 874         and start calling the run() method on each one of them, first with
 875         an initial argument and then with the returned value of the previous
 876         PostProcessor.
 877
 878         The chain will be stopped if one of them ever returns None or the end
 879         of the chain is reached.
 880
 881         PostProcessor objects follow a "mutual registration" process similar
 882         to InfoExtractor objects.
 883         """
 884
 885         _downloader = None
 886
 887         def __init__(self, downloader=None):
 888                 self._downloader = downloader
 889
 890         def to_stdout(self, message):
 891                 """Print message to stdout if downloader is not in quiet mode."""
 892                 if self._downloader is None or not self._downloader.params.get('quiet', False):
 893                         print message
 894
 895         def to_stderr(self, message):
 896                 """Print message to stderr."""
 897                 print >>sys.stderr, message
 898
 899         def set_downloader(self, downloader):
 900                 """Sets the downloader for this PP."""
 901                 self._downloader = downloader
 902
 903         def run(self, information):
 904                 """Run the PostProcessor.
 905
 906                 The "information" argument is a dictionary like the ones
 907                 returned by InfoExtractors. The only difference is that this
 908                 one has an extra field called "filepath" that points to the
 909                 downloaded file.
 910
 911                 When this method returns None, the postprocessing chain is
 912                 stopped. However, this method may return an information
 913                 dictionary that will be passed to the next postprocessing
 914                 object in the chain. It can be the one it received after
 915                 changing some fields.
 916
 917                 In addition, this method may raise a PostProcessingError
 918                 exception that will be taken into account by the downloader
 919                 it was called from.
 920                 """
 921                 return information # by default, do nothing
 922
 923 ### MAIN PROGRAM ###
 924 if __name__ == '__main__':
 925         try:
 926                 # Modules needed only when running the main program
 927                 import getpass
 928                 import optparse
 929
 930                 # General configuration
 931                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 932                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 933                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 934
 935                 # Parse command line
 936                 parser = optparse.OptionParser(
 937                                 usage='Usage: %prog [options] url...',
 938                                 version='INTERNAL',
 939                                 conflict_handler='resolve',
 940                                 )
 941                 parser.add_option('-h', '--help',
 942                                 action='help', help='print this help text and exit')
 943                 parser.add_option('-v', '--version',
 944                                 action='version', help='print program version and exit')
 945                 parser.add_option('-u', '--username',
 946                                 dest='username', metavar='UN', help='account username')
 947                 parser.add_option('-p', '--password',
 948                                 dest='password', metavar='PW', help='account password')
 949                 parser.add_option('-o', '--output',
 950                                 dest='outtmpl', metavar='TPL', help='output filename template')
 951                 parser.add_option('-q', '--quiet',
 952                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 953                 parser.add_option('-s', '--simulate',
 954                                 action='store_true', dest='simulate', help='do not download video', default=False)
 955                 parser.add_option('-t', '--title',
 956                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 957                 parser.add_option('-l', '--literal',
 958                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 959                 parser.add_option('-n', '--netrc',
 960                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 961                 parser.add_option('-g', '--get-url',
 962                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 963                 parser.add_option('-e', '--get-title',
 964                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 965                 parser.add_option('-f', '--format',
 966                                 dest='format', metavar='FMT', help='video format code')
 967                 parser.add_option('-m', '--mobile-version',
 968                                 action='store_const', dest='format', help='alias for -f 17', const='17')
 969                 parser.add_option('-d', '--high-def',
 970                                 action='store_const', dest='format', help='alias for -f 22', const='22')
 971                 parser.add_option('-i', '--ignore-errors',
 972                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 973                 parser.add_option('-r', '--rate-limit',
 974                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
 975                 parser.add_option('-a', '--batch-file',
 976                                 dest='batchfile', metavar='F', help='file containing URLs to download')
 977                 parser.add_option('-w', '--no-overwrites',
 978                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
 979                 (opts, args) = parser.parse_args()
 980
 981                 # Batch file verification
 982                 batchurls = []
 983                 if opts.batchfile is not None:
 984                         try:
 985                                 batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
 986                         except IOError:
 987                                 sys.exit(u'ERROR: batch file could not be read')
 988                 all_urls = batchurls + args
 989
 990                 # Conflicting, missing and erroneous options
 991                 if len(all_urls) < 1:
 992                         sys.exit(u'ERROR: you must provide at least one URL')
 993                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 994                         sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
 995                 if opts.password is not None and opts.username is None:
 996                         sys.exit(u'ERROR: account username missing')
 997                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 998                         sys.exit(u'ERROR: using output template conflicts with using title or literal title')
 999                 if opts.usetitle and opts.useliteral:
1000                         sys.exit(u'ERROR: using title conflicts with using literal title')
1001                 if opts.username is not None and opts.password is None:
1002                         opts.password = getpass.getpass(u'Type account password and press return:')
1003                 if opts.ratelimit is not None:
1004                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1005                         if numeric_limit is None:
1006                                 sys.exit(u'ERROR: invalid rate limit specified')
1007                         opts.ratelimit = numeric_limit
1008
1009                 # Information extractors
1010                 youtube_ie = YoutubeIE()
1011                 metacafe_ie = MetacafeIE(youtube_ie)
1012                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1013                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1014
1015                 # File downloader
1016                 charset = locale.getdefaultlocale()[1]
1017                 if charset is None:
1018                         charset = 'ascii'
1019                 fd = FileDownloader({
1020                         'usenetrc': opts.usenetrc,
1021                         'username': opts.username,
1022                         'password': opts.password,
1023                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1024                         'forceurl': opts.geturl,
1025                         'forcetitle': opts.gettitle,
1026                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1027                         'format': opts.format,
1028                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
1029                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1030                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1031                                 or u'%(id)s.%(ext)s'),
1032                         'ignoreerrors': opts.ignoreerrors,
1033                         'ratelimit': opts.ratelimit,
1034                         'nooverwrites': opts.nooverwrites,
1035                         })
1036                 fd.add_info_extractor(youtube_search_ie)
1037                 fd.add_info_extractor(youtube_pl_ie)
1038                 fd.add_info_extractor(metacafe_ie)
1039                 fd.add_info_extractor(youtube_ie)
1040                 retcode = fd.download(all_urls)
1041                 sys.exit(retcode)
1042
1043         except DownloadError:
1044                 sys.exit(1)
1045         except SameFileError:
1046                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1047         except KeyboardInterrupt:
1048                 sys.exit(u'\nERROR: Interrupted by user')