_ Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         forceurl:       Force printing final URL.
  59         forcetitle:     Force printing title.
  60         simulate:       Do not download the video files.
  61         format:         Video format code.
  62         outtmpl:        Template for output names.
  63         """
  64
  65         _params = None
  66         _ies = []
  67
  68         def __init__(self, params):
  69                 self._ies = []
  70                 self.set_params(params)
  71
  72         @staticmethod
  73         def pmkdir(filename):
  74                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  75                 components = filename.split(os.sep)
  76                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  77                 for dir in aggregate:
  78                         if not os.path.exists(dir):
  79                                 os.mkdir(dir)
  80
  81         @staticmethod
  82         def format_bytes(bytes):
  83                 if bytes is None:
  84                         return 'N/A'
  85                 if bytes == 0:
  86                         exponent = 0
  87                 else:
  88                         exponent = long(math.log(float(bytes), 1024.0))
  89                 suffix = 'bkMGTPEZY'[exponent]
  90                 converted = float(bytes) / float(1024**exponent)
  91                 return '%.2f%s' % (converted, suffix)
  92
  93         @staticmethod
  94         def calc_percent(byte_counter, data_len):
  95                 if data_len is None:
  96                         return '---.-%'
  97                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  98
  99         @staticmethod
 100         def calc_eta(start, now, total, current):
 101                 if total is None:
 102                         return '--:--'
 103                 dif = now - start
 104                 if current == 0 or dif < 0.001: # One millisecond
 105                         return '--:--'
 106                 rate = float(current) / dif
 107                 eta = long((float(total) - float(current)) / rate)
 108                 (eta_mins, eta_secs) = divmod(eta, 60)
 109                 if eta_mins > 99:
 110                         return '--:--'
 111                 return '%02d:%02d' % (eta_mins, eta_secs)
 112
 113         @staticmethod
 114         def calc_speed(start, now, bytes):
 115                 dif = now - start
 116                 if bytes == 0 or dif < 0.001: # One millisecond
 117                         return '%10s' % '---b/s'
 118                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 119
 120         @staticmethod
 121         def best_block_size(elapsed_time, bytes):
 122                 new_min = max(bytes / 2.0, 1.0)
 123                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 124                 if elapsed_time < 0.001:
 125                         return int(new_max)
 126                 rate = bytes / elapsed_time
 127                 if rate > new_max:
 128                         return int(new_max)
 129                 if rate < new_min:
 130                         return int(new_min)
 131                 return int(rate)
 132
 133         def set_params(self, params):
 134                 """Sets parameters."""
 135                 if type(params) != dict:
 136                         raise ValueError('params: dictionary expected')
 137                 self._params = params
 138
 139         def get_params(self):
 140                 """Get parameters."""
 141                 return self._params
 142
 143         def add_info_extractor(self, ie):
 144                 """Add an InfoExtractor object to the end of the list."""
 145                 self._ies.append(ie)
 146                 ie.set_downloader(self)
 147
 148         def to_stdout(self, message, skip_eol=False):
 149                 """Print message to stdout if not in quiet mode."""
 150                 if not self._params.get('quiet', False):
 151                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 152                         sys.stdout.flush()
 153
 154         def to_stderr(self, message):
 155                 """Print message to stderr."""
 156                 sys.stderr.write('%s\n' % message)
 157
 158         def fixed_template(self):
 159                 """Checks if the output template is fixed."""
 160                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 161
 162         def download(self, url_list):
 163                 """Download a given list of URLs."""
 164                 if len(url_list) > 1 and self.fixed_template():
 165                         sys.exit('ERROR: fixed output name but more than one file to download')
 166
 167                 for url in url_list:
 168                         suitable_found = False
 169                         for ie in self._ies:
 170                                 if not ie.suitable(url):
 171                                         continue
 172                                 # Suitable InfoExtractor found
 173                                 suitable_found = True
 174                                 results = [x for x in ie.extract(url) if x is not None]
 175
 176                                 if len(results) > 1 and self.fixed_template():
 177                                         sys.exit('ERROR: fixed output name but more than one file to download')
 178
 179                                 for result in results:
 180
 181                                         # Forced printings
 182                                         if self._params.get('forcetitle', False):
 183                                                 print result['title']
 184                                         if self._params.get('forceurl', False):
 185                                                 print result['url']
 186
 187                                         # Do nothing else if in simulate mode
 188                                         if self._params.get('simulate', False):
 189                                                 continue
 190
 191                                         try:
 192                                                 filename = self._params['outtmpl'] % result
 193                                         except (ValueError, KeyError), err:
 194                                                 self.to_stderr('ERROR: invalid output template: %s' % str(err))
 195                                                 continue
 196                                         try:
 197                                                 self.pmkdir(filename)
 198                                         except (OSError, IOError), err:
 199                                                 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
 200                                                 continue
 201                                         try:
 202                                                 outstream = open(filename, 'wb')
 203                                         except (OSError, IOError), err:
 204                                                 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
 205                                                 continue
 206                                         try:
 207                                                 self._do_download(outstream, result['url'])
 208                                                 outstream.close()
 209                                         except (OSError, IOError), err:
 210                                                 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
 211                                                 continue
 212                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 213                                                 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
 214                                                 continue
 215                                 break
 216                         if not suitable_found:
 217                                 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
 218
 219         def _do_download(self, stream, url):
 220                 request = urllib2.Request(url, None, std_headers)
 221                 data = urllib2.urlopen(request)
 222                 data_len = data.info().get('Content-length', None)
 223                 data_len_str = self.format_bytes(data_len)
 224                 byte_counter = 0
 225                 block_size = 1024
 226                 start = time.time()
 227                 while True:
 228                         percent_str = self.calc_percent(byte_counter, data_len)
 229                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 230                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 231                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 232                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 233
 234                         before = time.time()
 235                         data_block = data.read(block_size)
 236                         after = time.time()
 237                         data_block_len = len(data_block)
 238                         if data_block_len == 0:
 239                                 break
 240                         byte_counter += data_block_len
 241                         stream.write(data_block)
 242                         block_size = self.best_block_size(after - before, data_block_len)
 243
 244                 self.to_stdout('')
 245                 if data_len is not None and str(byte_counter) != data_len:
 246                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 247
 248 class InfoExtractor(object):
 249         """Information Extractor class.
 250
 251         Information extractors are the classes that, given a URL, extract
 252         information from the video (or videos) the URL refers to. This
 253         information includes the real video URL, the video title and simplified
 254         title, author and others. It is returned in a list of dictionaries when
 255         calling its extract() method. It is a list because a URL can refer to
 256         more than one video (think of playlists). The dictionaries must include
 257         the following fields:
 258
 259         id:             Video identifier.
 260         url:            Final video URL.
 261         uploader:       Nickname of the video uploader.
 262         title:          Literal title.
 263         stitle:         Simplified title.
 264         ext:            Video filename extension.
 265
 266         Subclasses of this one should re-define the _real_initialize() and
 267         _real_extract() methods, as well as the suitable() static method.
 268         Probably, they should also be instantiated and added to the main
 269         downloader.
 270         """
 271
 272         _ready = False
 273         _downloader = None
 274
 275         def __init__(self, downloader=None):
 276                 """Constructor. Receives an optional downloader."""
 277                 self._ready = False
 278                 self.set_downloader(downloader)
 279
 280         @staticmethod
 281         def suitable(url):
 282                 """Receives a URL and returns True if suitable for this IE."""
 283                 return True
 284
 285         def initialize(self):
 286                 """Initializes an instance (login, etc)."""
 287                 if not self._ready:
 288                         self._real_initialize()
 289                         self._ready = True
 290
 291         def extract(self, url):
 292                 """Extracts URL information and returns it in list of dicts."""
 293                 self.initialize()
 294                 return self._real_extract(url)
 295
 296         def set_downloader(self, downloader):
 297                 """Sets the downloader for this IE."""
 298                 self._downloader = downloader
 299
 300         def to_stdout(self, message):
 301                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 302                         print message
 303
 304         def to_stderr(self, message):
 305                 sys.stderr.write('%s\n' % message)
 306
 307         def _real_initialize(self):
 308                 """Real initialization process. Redefine in subclasses."""
 309                 pass
 310
 311         def _real_extract(self, url):
 312                 """Real extraction process. Redefine in subclasses."""
 313                 pass
 314
 315 class YoutubeIE(InfoExtractor):
 316         """Information extractor for youtube.com."""
 317
 318         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 319         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 320         _NETRC_MACHINE = 'youtube'
 321
 322         def _real_initialize(self):
 323                 if self._downloader is None:
 324                         return
 325
 326                 username = None
 327                 password = None
 328                 downloader_params = self._downloader.get_params()
 329
 330                 # Attempt to use provided username and password or .netrc data
 331                 if downloader_params.get('username', None) is not None:
 332                         username = downloader_params['username']
 333                         password = downloader_params['password']
 334                 elif downloader_params.get('usenetrc', False):
 335                         try:
 336                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 337                                 if info is not None:
 338                                         username = info[0]
 339                                         password = info[2]
 340                                 else:
 341                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 342                         except (IOError, netrc.NetrcParseError), err:
 343                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 344                                 return
 345
 346                 if username is None:
 347                         return
 348
 349                 # Log in
 350                 login_form = {
 351                                 'current_form': 'loginForm',
 352                                 'next':         '/',
 353                                 'action_login': 'Log In',
 354                                 'username':     username,
 355                                 'password':     password,
 356                                 }
 357                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 358                 try:
 359                         self.to_stdout('[youtube] Logging in')
 360                         login_results = urllib2.urlopen(request).read()
 361                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 362                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 363                                 return
 364                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 365                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 366                         return
 367
 368                 # Confirm age
 369                 age_form = {
 370                                 'next_url':             '/',
 371                                 'action_confirm':       'Confirm',
 372                                 }
 373                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 374                 try:
 375                         self.to_stdout('[youtube] Confirming age')
 376                         age_results = urllib2.urlopen(request).read()
 377                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 378                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 379
 380         def _real_extract(self, url):
 381                 # Extract video id from URL
 382                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 383                 if mobj is None:
 384                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 385                         return [None]
 386                 video_id = mobj.group(2)
 387
 388                 # Downloader parameters
 389                 format_param = None
 390                 if self._downloader is not None:
 391                         params = self._downloader.get_params()
 392                         format_param = params.get('format', None)
 393
 394                 # Extension
 395                 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
 396
 397                 # Normalize URL, including format
 398                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 399                 if format_param is not None:
 400                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 401                 request = urllib2.Request(normalized_url, None, std_headers)
 402                 try:
 403                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 404                         video_webpage = urllib2.urlopen(request).read()
 405                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 406                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 407                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 408
 409                 # "t" param
 410                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 411                 if mobj is None:
 412                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 413                         return [None]
 414                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 415                 if format_param is not None:
 416                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 417                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 418
 419                 # uploader
 420                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 421                 if mobj is None:
 422                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 423                         return [None]
 424                 video_uploader = mobj.group(1)
 425
 426                 # title
 427                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 428                 if mobj is None:
 429                         self.to_stderr('ERROR: Unable to extract video title')
 430                         return [None]
 431                 video_title = mobj.group(1).decode('utf-8')
 432                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 433
 434                 # simplified title
 435                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 436                 simple_title = simple_title.strip(ur'_')
 437
 438                 # Return information
 439                 return [{
 440                         'id':           video_id,
 441                         'url':          video_real_url,
 442                         'uploader':     video_uploader,
 443                         'title':        video_title,
 444                         'stitle':       simple_title,
 445                         'ext':          video_extension,
 446                         }]
 447
 448 if __name__ == '__main__':
 449         try:
 450                 # Modules needed only when running the main program
 451                 import getpass
 452                 import optparse
 453
 454                 # General configuration
 455                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 456                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 457                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 458
 459                 # Parse command line
 460                 parser = optparse.OptionParser(
 461                                 usage='Usage: %prog [options] url...',
 462                                 version='INTERNAL',
 463                                 conflict_handler='resolve',
 464                                 )
 465                 parser.add_option('-h', '--help',
 466                                 action='help', help='print this help text and exit')
 467                 parser.add_option('-v', '--version',
 468                                 action='version', help='print program version and exit')
 469                 parser.add_option('-u', '--username',
 470                                 dest='username', metavar='UN', help='account username')
 471                 parser.add_option('-p', '--password',
 472                                 dest='password', metavar='PW', help='account password')
 473                 parser.add_option('-o', '--output',
 474                                 dest='outtmpl', metavar='TPL', help='output filename template')
 475                 parser.add_option('-q', '--quiet',
 476                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 477                 parser.add_option('-s', '--simulate',
 478                                 action='store_true', dest='simulate', help='do not download video', default=False)
 479                 parser.add_option('-t', '--title',
 480                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 481                 parser.add_option('-l', '--literal',
 482                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 483                 parser.add_option('-n', '--netrc',
 484                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 485                 parser.add_option('-g', '--get-url',
 486                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 487                 parser.add_option('-e', '--get-title',
 488                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 489                 parser.add_option('-f', '--format',
 490                                 dest='format', metavar='FMT', help='video format code')
 491                 parser.add_option('-b', '--best-quality',
 492                                 action='store_const', dest='video_format', help='alias for -f 18', const='18')
 493                 (opts, args) = parser.parse_args()
 494
 495                 # Conflicting, missing and erroneous options
 496                 if len(args) < 1:
 497                         sys.exit('ERROR: you must provide at least one URL')
 498                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 499                         sys.exit('ERROR: using .netrc conflicts with giving username/password')
 500                 if opts.password is not None and opts.username is None:
 501                         sys.exit('ERROR: account username missing')
 502                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 503                         sys.exit('ERROR: using output template conflicts with using title or literal title')
 504                 if opts.usetitle and opts.useliteral:
 505                         sys.exit('ERROR: using title conflicts with using literal title')
 506                 if opts.username is not None and opts.password is None:
 507                         opts.password = getpass.getpass('Type account password and press return:')
 508
 509                 # Information extractors
 510                 youtube_ie = YoutubeIE()
 511
 512                 # File downloader
 513                 fd = FileDownloader({
 514                         'usenetrc': opts.usenetrc,
 515                         'username': opts.username,
 516                         'password': opts.password,
 517                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 518                         'forceurl': opts.geturl,
 519                         'forcetitle': opts.gettitle,
 520                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 521                         'format': opts.format,
 522                         'outtmpl': ((opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
 523                                 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
 524                                 or '%(id)s.%(ext)s'),
 525                         })
 526                 fd.add_info_extractor(youtube_ie)
 527                 fd.download(args)
 528
 529         except KeyboardInterrupt:
 530                 sys.exit('\nERROR: Interrupted by user')