_ Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         forceurl:       Force printing final URL.
  59         forcetitle:     Force printing title.
  60         simulate:       Do not download the video files.
  61         format:         Video format code.
  62         outtmpl:        Template for output names.
  63         ignoreerrors:   Do not stop on download errors.
  64         """
  65
  66         _params = None
  67         _ies = []
  68
  69         def __init__(self, params):
  70                 """Create a FileDownloader object with the given options."""
  71                 self._ies = []
  72                 self.set_params(params)
  73
  74         @staticmethod
  75         def pmkdir(filename):
  76                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  77                 components = filename.split(os.sep)
  78                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  79                 for dir in aggregate:
  80                         if not os.path.exists(dir):
  81                                 os.mkdir(dir)
  82
  83         @staticmethod
  84         def format_bytes(bytes):
  85                 if bytes is None:
  86                         return 'N/A'
  87                 if bytes == 0:
  88                         exponent = 0
  89                 else:
  90                         exponent = long(math.log(float(bytes), 1024.0))
  91                 suffix = 'bkMGTPEZY'[exponent]
  92                 converted = float(bytes) / float(1024**exponent)
  93                 return '%.2f%s' % (converted, suffix)
  94
  95         @staticmethod
  96         def calc_percent(byte_counter, data_len):
  97                 if data_len is None:
  98                         return '---.-%'
  99                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
 100
 101         @staticmethod
 102         def calc_eta(start, now, total, current):
 103                 if total is None:
 104                         return '--:--'
 105                 dif = now - start
 106                 if current == 0 or dif < 0.001: # One millisecond
 107                         return '--:--'
 108                 rate = float(current) / dif
 109                 eta = long((float(total) - float(current)) / rate)
 110                 (eta_mins, eta_secs) = divmod(eta, 60)
 111                 if eta_mins > 99:
 112                         return '--:--'
 113                 return '%02d:%02d' % (eta_mins, eta_secs)
 114
 115         @staticmethod
 116         def calc_speed(start, now, bytes):
 117                 dif = now - start
 118                 if bytes == 0 or dif < 0.001: # One millisecond
 119                         return '%10s' % '---b/s'
 120                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 121
 122         @staticmethod
 123         def best_block_size(elapsed_time, bytes):
 124                 new_min = max(bytes / 2.0, 1.0)
 125                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 126                 if elapsed_time < 0.001:
 127                         return int(new_max)
 128                 rate = bytes / elapsed_time
 129                 if rate > new_max:
 130                         return int(new_max)
 131                 if rate < new_min:
 132                         return int(new_min)
 133                 return int(rate)
 134
 135         def set_params(self, params):
 136                 """Sets parameters."""
 137                 if type(params) != dict:
 138                         raise ValueError('params: dictionary expected')
 139                 self._params = params
 140
 141         def get_params(self):
 142                 """Get parameters."""
 143                 return self._params
 144
 145         def add_info_extractor(self, ie):
 146                 """Add an InfoExtractor object to the end of the list."""
 147                 self._ies.append(ie)
 148                 ie.set_downloader(self)
 149
 150         def to_stdout(self, message, skip_eol=False):
 151                 """Print message to stdout if not in quiet mode."""
 152                 if not self._params.get('quiet', False):
 153                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 154                         sys.stdout.flush()
 155
 156         def to_stderr(self, message):
 157                 """Print message to stderr."""
 158                 sys.stderr.write('%s\n' % message)
 159
 160         def fixed_template(self):
 161                 """Checks if the output template is fixed."""
 162                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 163
 164         def trouble(self, message=None):
 165                 """Determine action to take when a download problem appears.
 166
 167                 Depending on if the downloader has been configured to ignore
 168                 download errors or not, this method may exit the program or
 169                 not when errors are found, after printing the message. If it
 170                 doesn't exit, it returns an error code suitable to be returned
 171                 later as a program exit code to indicate error.
 172                 """
 173                 if message is not None:
 174                         self.to_stderr(message)
 175                 if not self._params.get('ignoreerrors', False):
 176                         sys.exit(1)
 177                 return 1
 178
 179         def download(self, url_list):
 180                 """Download a given list of URLs."""
 181                 retcode = 0
 182                 if len(url_list) > 1 and self.fixed_template():
 183                         sys.exit('ERROR: fixed output name but more than one file to download')
 184
 185                 for url in url_list:
 186                         suitable_found = False
 187                         for ie in self._ies:
 188                                 if not ie.suitable(url):
 189                                         continue
 190                                 # Suitable InfoExtractor found
 191                                 suitable_found = True
 192                                 all_results = ie.extract(url)
 193                                 results = [x for x in all_results if x is not None]
 194                                 if len(results) != len(all_results):
 195                                         retcode = self.trouble()
 196
 197                                 if len(results) > 1 and self.fixed_template():
 198                                         sys.exit('ERROR: fixed output name but more than one file to download')
 199
 200                                 for result in results:
 201
 202                                         # Forced printings
 203                                         if self._params.get('forcetitle', False):
 204                                                 print result['title']
 205                                         if self._params.get('forceurl', False):
 206                                                 print result['url']
 207
 208                                         # Do nothing else if in simulate mode
 209                                         if self._params.get('simulate', False):
 210                                                 continue
 211
 212                                         try:
 213                                                 filename = self._params['outtmpl'] % result
 214                                         except (ValueError, KeyError), err:
 215                                                 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
 216                                                 continue
 217                                         try:
 218                                                 self.pmkdir(filename)
 219                                         except (OSError, IOError), err:
 220                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
 221                                                 continue
 222                                         try:
 223                                                 outstream = open(filename, 'wb')
 224                                         except (OSError, IOError), err:
 225                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
 226                                                 continue
 227                                         try:
 228                                                 self._do_download(outstream, result['url'])
 229                                                 outstream.close()
 230                                         except (OSError, IOError), err:
 231                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
 232                                                 continue
 233                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 234                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
 235                                                 continue
 236                                 break
 237                         if not suitable_found:
 238                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
 239
 240                 return retcode
 241
 242         def _do_download(self, stream, url):
 243                 request = urllib2.Request(url, None, std_headers)
 244                 data = urllib2.urlopen(request)
 245                 data_len = data.info().get('Content-length', None)
 246                 data_len_str = self.format_bytes(data_len)
 247                 byte_counter = 0
 248                 block_size = 1024
 249                 start = time.time()
 250                 while True:
 251                         percent_str = self.calc_percent(byte_counter, data_len)
 252                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 253                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 254                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 255                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 256
 257                         before = time.time()
 258                         data_block = data.read(block_size)
 259                         after = time.time()
 260                         data_block_len = len(data_block)
 261                         if data_block_len == 0:
 262                                 break
 263                         byte_counter += data_block_len
 264                         stream.write(data_block)
 265                         block_size = self.best_block_size(after - before, data_block_len)
 266
 267                 self.to_stdout('')
 268                 if data_len is not None and str(byte_counter) != data_len:
 269                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 270
 271 class InfoExtractor(object):
 272         """Information Extractor class.
 273
 274         Information extractors are the classes that, given a URL, extract
 275         information from the video (or videos) the URL refers to. This
 276         information includes the real video URL, the video title and simplified
 277         title, author and others. It is returned in a list of dictionaries when
 278         calling its extract() method. It is a list because a URL can refer to
 279         more than one video (think of playlists). The dictionaries must include
 280         the following fields:
 281
 282         id:             Video identifier.
 283         url:            Final video URL.
 284         uploader:       Nickname of the video uploader.
 285         title:          Literal title.
 286         stitle:         Simplified title.
 287         ext:            Video filename extension.
 288
 289         Subclasses of this one should re-define the _real_initialize() and
 290         _real_extract() methods, as well as the suitable() static method.
 291         Probably, they should also be instantiated and added to the main
 292         downloader.
 293         """
 294
 295         _ready = False
 296         _downloader = None
 297
 298         def __init__(self, downloader=None):
 299                 """Constructor. Receives an optional downloader."""
 300                 self._ready = False
 301                 self.set_downloader(downloader)
 302
 303         @staticmethod
 304         def suitable(url):
 305                 """Receives a URL and returns True if suitable for this IE."""
 306                 return True
 307
 308         def initialize(self):
 309                 """Initializes an instance (authentication, etc)."""
 310                 if not self._ready:
 311                         self._real_initialize()
 312                         self._ready = True
 313
 314         def extract(self, url):
 315                 """Extracts URL information and returns it in list of dicts."""
 316                 self.initialize()
 317                 return self._real_extract(url)
 318
 319         def set_downloader(self, downloader):
 320                 """Sets the downloader for this IE."""
 321                 self._downloader = downloader
 322
 323         def to_stdout(self, message):
 324                 """Print message to stdout if downloader is not in quiet mode."""
 325                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 326                         print message
 327
 328         def to_stderr(self, message):
 329                 """Print message to stderr."""
 330                 sys.stderr.write('%s\n' % message)
 331
 332         def _real_initialize(self):
 333                 """Real initialization process. Redefine in subclasses."""
 334                 pass
 335
 336         def _real_extract(self, url):
 337                 """Real extraction process. Redefine in subclasses."""
 338                 pass
 339
 340 class YoutubeIE(InfoExtractor):
 341         """Information extractor for youtube.com."""
 342
 343         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 344         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 345         _NETRC_MACHINE = 'youtube'
 346
 347         def _real_initialize(self):
 348                 if self._downloader is None:
 349                         return
 350
 351                 username = None
 352                 password = None
 353                 downloader_params = self._downloader.get_params()
 354
 355                 # Attempt to use provided username and password or .netrc data
 356                 if downloader_params.get('username', None) is not None:
 357                         username = downloader_params['username']
 358                         password = downloader_params['password']
 359                 elif downloader_params.get('usenetrc', False):
 360                         try:
 361                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 362                                 if info is not None:
 363                                         username = info[0]
 364                                         password = info[2]
 365                                 else:
 366                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 367                         except (IOError, netrc.NetrcParseError), err:
 368                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 369                                 return
 370
 371                 # No authentication to be performed
 372                 if username is None:
 373                         return
 374
 375                 # Log in
 376                 login_form = {
 377                                 'current_form': 'loginForm',
 378                                 'next':         '/',
 379                                 'action_login': 'Log In',
 380                                 'username':     username,
 381                                 'password':     password,
 382                                 }
 383                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 384                 try:
 385                         self.to_stdout('[youtube] Logging in')
 386                         login_results = urllib2.urlopen(request).read()
 387                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 388                                 self.to_stderr('WARNING: unable to log in: bad username or password')
 389                                 return
 390                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 391                         self.to_stderr('WARNING: unable to log in: %s' % str(err))
 392                         return
 393
 394                 # Confirm age
 395                 age_form = {
 396                                 'next_url':             '/',
 397                                 'action_confirm':       'Confirm',
 398                                 }
 399                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 400                 try:
 401                         self.to_stdout('[youtube] Confirming age')
 402                         age_results = urllib2.urlopen(request).read()
 403                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 404                         sys.exit('ERROR: unable to confirm age: %s' % str(err))
 405
 406         def _real_extract(self, url):
 407                 # Extract video id from URL
 408                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 409                 if mobj is None:
 410                         self.to_stderr('ERROR: invalid URL: %s' % url)
 411                         return [None]
 412                 video_id = mobj.group(2)
 413
 414                 # Downloader parameters
 415                 format_param = None
 416                 if self._downloader is not None:
 417                         params = self._downloader.get_params()
 418                         format_param = params.get('format', None)
 419
 420                 # Extension
 421                 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
 422
 423                 # Normalize URL, including format
 424                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 425                 if format_param is not None:
 426                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 427                 request = urllib2.Request(normalized_url, None, std_headers)
 428                 try:
 429                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 430                         video_webpage = urllib2.urlopen(request).read()
 431                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 432                         sys.exit('ERROR: unable to download video: %s' % str(err))
 433                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 434
 435                 # "t" param
 436                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 437                 if mobj is None:
 438                         self.to_stderr('ERROR: unable to extract "t" parameter')
 439                         return [None]
 440                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 441                 if format_param is not None:
 442                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 443                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 444
 445                 # uploader
 446                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 447                 if mobj is None:
 448                         self.to_stderr('ERROR: unable to extract uploader nickname')
 449                         return [None]
 450                 video_uploader = mobj.group(1)
 451
 452                 # title
 453                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 454                 if mobj is None:
 455                         self.to_stderr('ERROR: unable to extract video title')
 456                         return [None]
 457                 video_title = mobj.group(1).decode('utf-8')
 458                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 459                 video_title = video_title.replace(os.sep, u'%')
 460
 461                 # simplified title
 462                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 463                 simple_title = simple_title.strip(ur'_')
 464
 465                 # Return information
 466                 return [{
 467                         'id':           video_id,
 468                         'url':          video_real_url,
 469                         'uploader':     video_uploader,
 470                         'title':        video_title,
 471                         'stitle':       simple_title,
 472                         'ext':          video_extension,
 473                         }]
 474
 475 if __name__ == '__main__':
 476         try:
 477                 # Modules needed only when running the main program
 478                 import getpass
 479                 import optparse
 480
 481                 # General configuration
 482                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 483                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 484                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
 485
 486                 # Parse command line
 487                 parser = optparse.OptionParser(
 488                                 usage='Usage: %prog [options] url...',
 489                                 version='INTERNAL',
 490                                 conflict_handler='resolve',
 491                                 )
 492                 parser.add_option('-h', '--help',
 493                                 action='help', help='print this help text and exit')
 494                 parser.add_option('-v', '--version',
 495                                 action='version', help='print program version and exit')
 496                 parser.add_option('-u', '--username',
 497                                 dest='username', metavar='UN', help='account username')
 498                 parser.add_option('-p', '--password',
 499                                 dest='password', metavar='PW', help='account password')
 500                 parser.add_option('-o', '--output',
 501                                 dest='outtmpl', metavar='TPL', help='output filename template')
 502                 parser.add_option('-q', '--quiet',
 503                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
 504                 parser.add_option('-s', '--simulate',
 505                                 action='store_true', dest='simulate', help='do not download video', default=False)
 506                 parser.add_option('-t', '--title',
 507                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
 508                 parser.add_option('-l', '--literal',
 509                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
 510                 parser.add_option('-n', '--netrc',
 511                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
 512                 parser.add_option('-g', '--get-url',
 513                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
 514                 parser.add_option('-e', '--get-title',
 515                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
 516                 parser.add_option('-f', '--format',
 517                                 dest='format', metavar='FMT', help='video format code')
 518                 parser.add_option('-b', '--best-quality',
 519                                 action='store_const', dest='video_format', help='alias for -f 18', const='18')
 520                 parser.add_option('-i', '--ignore-errors',
 521                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
 522                 (opts, args) = parser.parse_args()
 523
 524                 # Conflicting, missing and erroneous options
 525                 if len(args) < 1:
 526                         sys.exit('ERROR: you must provide at least one URL')
 527                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
 528                         sys.exit('ERROR: using .netrc conflicts with giving username/password')
 529                 if opts.password is not None and opts.username is None:
 530                         sys.exit('ERROR: account username missing')
 531                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
 532                         sys.exit('ERROR: using output template conflicts with using title or literal title')
 533                 if opts.usetitle and opts.useliteral:
 534                         sys.exit('ERROR: using title conflicts with using literal title')
 535                 if opts.username is not None and opts.password is None:
 536                         opts.password = getpass.getpass('Type account password and press return:')
 537
 538                 # Information extractors
 539                 youtube_ie = YoutubeIE()
 540
 541                 # File downloader
 542                 fd = FileDownloader({
 543                         'usenetrc': opts.usenetrc,
 544                         'username': opts.username,
 545                         'password': opts.password,
 546                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
 547                         'forceurl': opts.geturl,
 548                         'forcetitle': opts.gettitle,
 549                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
 550                         'format': opts.format,
 551                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
 552                                 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
 553                                 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
 554                                 or '%(id)s.%(ext)s'),
 555                         'ignoreerrors': opts.ignoreerrors,
 556                         })
 557                 fd.add_info_extractor(youtube_ie)
 558                 retcode = fd.download(args)
 559                 sys.exit(retcode)
 560
 561         except KeyboardInterrupt:
 562                 sys.exit('\nERROR: Interrupted by user')