git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports being able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         simulate:       Do not download the video files.
  59         format:         Video format code.
  60         outtmpl:        Template for output names.
  61         """
  62
  63         _params = None
  64         _ies = []
  65
  66         def __init__(self, params):
  67                 self._ies = []
  68                 self.set_params(params)
  69
  70         @staticmethod
  71         def pmkdir(filename):
  72                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  73                 components = filename.split(os.sep)
  74                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  75                 for dir in aggregate:
  76                         if not os.path.exists(dir):
  77                                 os.mkdir(dir)
  78
  79         @staticmethod
  80         def format_bytes(bytes):
  81                 if bytes is None:
  82                         return 'N/A'
  83                 if bytes == 0:
  84                         exponent = 0
  85                 else:
  86                         exponent = long(math.log(float(bytes), 1024.0))
  87                 suffix = 'bkMGTPEZY'[exponent]
  88                 converted = float(bytes) / float(1024**exponent)
  89                 return '%.2f%s' % (converted, suffix)
  90
  91         @staticmethod
  92         def calc_percent(byte_counter, data_len):
  93                 if data_len is None:
  94                         return '---.-%'
  95                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  96
  97         @staticmethod
  98         def calc_eta(start, now, total, current):
  99                 if total is None:
 100                         return '--:--'
 101                 dif = now - start
 102                 if current == 0 or dif < 0.001: # One millisecond
 103                         return '--:--'
 104                 rate = float(current) / dif
 105                 eta = long((float(total) - float(current)) / rate)
 106                 (eta_mins, eta_secs) = divmod(eta, 60)
 107                 if eta_mins > 99:
 108                         return '--:--'
 109                 return '%02d:%02d' % (eta_mins, eta_secs)
 110
 111         @staticmethod
 112         def calc_speed(start, now, bytes):
 113                 dif = now - start
 114                 if bytes == 0 or dif < 0.001: # One millisecond
 115                         return '%10s' % '---b/s'
 116                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 117
 118         @staticmethod
 119         def best_block_size(elapsed_time, bytes):
 120                 new_min = max(bytes / 2.0, 1.0)
 121                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 122                 if elapsed_time < 0.001:
 123                         return int(new_max)
 124                 rate = bytes / elapsed_time
 125                 if rate > new_max:
 126                         return int(new_max)
 127                 if rate < new_min:
 128                         return int(new_min)
 129                 return int(rate)
 130
 131         def set_params(self, params):
 132                 """Sets parameters."""
 133                 if type(params) != dict:
 134                         raise ValueError('params: dictionary expected')
 135                 self._params = params
 136
 137         def get_params(self):
 138                 """Get parameters."""
 139                 return self._params
 140
 141         def add_info_extractor(self, ie):
 142                 """Add an InfoExtractor object to the end of the list."""
 143                 self._ies.append(ie)
 144                 ie.set_downloader(self)
 145
 146         def to_stdout(self, message, skip_eol=False):
 147                 """Print message to stdout if not in quiet mode."""
 148                 if not self._params.get('quiet', False):
 149                         sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
 150                         sys.stdout.flush()
 151
 152         def to_stderr(self, message):
 153                 """Print message to stderr."""
 154                 sys.stderr.write('%s\n' % message)
 155
 156         def fixed_template(self):
 157                 """Checks if the output template is fixed."""
 158                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
 159
 160         def download(self, url_list):
 161                 """Download a given list of URLs."""
 162                 if len(url_list) > 1 and self.fixed_template():
 163                         sys.exit('ERROR: fixed output name but more than one file to download')
 164
 165                 for url in url_list:
 166                         suitable_found = False
 167                         for ie in self._ies:
 168                                 if not ie.suitable(url):
 169                                         continue
 170                                 # Suitable InfoExtractor found
 171                                 suitable_found = True
 172                                 results = [x for x in ie.extract(url) if x is not None]
 173
 174                                 if len(results) > 1 and self.fixed_template():
 175                                         sys.exit('ERROR: fixed output name but more than one file to download')
 176
 177                                 if self._params.get('simulate', False):
 178                                         continue
 179
 180                                 for result in results:
 181                                         try:
 182                                                 filename = self._params['outtmpl'] % result
 183                                         except (ValueError, KeyError), err:
 184                                                 self.to_stderr('ERROR: invalid output template: %s' % str(err))
 185                                                 continue
 186                                         try:
 187                                                 self.pmkdir(filename)
 188                                         except (OSError, IOError), err:
 189                                                 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
 190                                                 continue
 191                                         try:
 192                                                 outstream = open(filename, 'wb')
 193                                         except (OSError, IOError), err:
 194                                                 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
 195                                                 continue
 196                                         try:
 197                                                 self._do_download(outstream, result['url'])
 198                                                 outstream.close()
 199                                         except (OSError, IOError), err:
 200                                                 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
 201                                                 continue
 202                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 203                                                 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
 204                                                 continue
 205                                 break
 206                         if not suitable_found:
 207                                 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
 208
 209         def _do_download(self, stream, url):
 210                 request = urllib2.Request(url, None, std_headers)
 211                 data = urllib2.urlopen(request)
 212                 data_len = data.info().get('Content-length', None)
 213                 data_len_str = self.format_bytes(data_len)
 214                 byte_counter = 0
 215                 block_size = 1024
 216                 start = time.time()
 217                 while True:
 218                         percent_str = self.calc_percent(byte_counter, data_len)
 219                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 220                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 221                         self.to_stdout('\r[download] %s of %s at %s ETA %s' %
 222                                         (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
 223
 224                         before = time.time()
 225                         data_block = data.read(block_size)
 226                         after = time.time()
 227                         data_block_len = len(data_block)
 228                         if data_block_len == 0:
 229                                 break
 230                         byte_counter += data_block_len
 231                         stream.write(data_block)
 232                         block_size = self.best_block_size(after - before, data_block_len)
 233
 234                 self.to_stdout('')
 235                 if data_len is not None and str(byte_counter) != data_len:
 236                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 237
 238 class InfoExtractor(object):
 239         """Information Extractor class.
 240
 241         Information extractors are the classes that, given a URL, extract
 242         information from the video (or videos) the URL refers to. This
 243         information includes the real video URL, the video title and simplified
 244         title, author and others. It is returned in a list of dictionaries when
 245         calling its extract() method. It is a list because a URL can refer to
 246         more than one video (think of playlists). The dictionaries must include
 247         the following fields:
 248
 249         id:             Video identifier.
 250         url:            Final video URL.
 251         uploader:       Nickname of the video uploader.
 252         title:          Literal title.
 253         stitle:         Simplified title.
 254         ext:            Video filename extension.
 255
 256         Subclasses of this one should re-define the _real_initialize() and
 257         _real_extract() methods, as well as the suitable() static method.
 258         Probably, they should also be instantiated and added to the main
 259         downloader.
 260         """
 261
 262         _ready = False
 263         _downloader = None
 264
 265         def __init__(self, downloader=None):
 266                 """Constructor. Receives an optional downloader."""
 267                 self._ready = False
 268                 self.set_downloader(downloader)
 269
 270         @staticmethod
 271         def suitable(url):
 272                 """Receives a URL and returns True if suitable for this IE."""
 273                 return True
 274
 275         def initialize(self):
 276                 """Initializes an instance (login, etc)."""
 277                 if not self._ready:
 278                         self._real_initialize()
 279                         self._ready = True
 280
 281         def extract(self, url):
 282                 """Extracts URL information and returns it in list of dicts."""
 283                 self.initialize()
 284                 return self._real_extract(url)
 285
 286         def set_downloader(self, downloader):
 287                 """Sets the downloader for this IE."""
 288                 self._downloader = downloader
 289
 290         def to_stdout(self, message):
 291                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 292                         print message
 293
 294         def to_stderr(self, message):
 295                 sys.stderr.write('%s\n' % message)
 296
 297         def _real_initialize(self):
 298                 """Real initialization process. Redefine in subclasses."""
 299                 pass
 300
 301         def _real_extract(self, url):
 302                 """Real extraction process. Redefine in subclasses."""
 303                 pass
 304
 305 class YoutubeIE(InfoExtractor):
 306         """Information extractor for youtube.com."""
 307
 308         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 309         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 310         _NETRC_MACHINE = 'youtube'
 311
 312         def _real_initialize(self):
 313                 if self._downloader is None:
 314                         return
 315
 316                 username = None
 317                 password = None
 318                 downloader_params = self._downloader.get_params()
 319
 320                 # Attempt to use provided username and password or .netrc data
 321                 if downloader_params.get('username', None) is not None:
 322                         username = downloader_params['username']
 323                         password = downloader_params['password']
 324                 elif downloader_params.get('usenetrc', False):
 325                         try:
 326                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 327                                 if info is not None:
 328                                         username = info[0]
 329                                         password = info[2]
 330                                 else:
 331                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 332                         except (IOError, netrc.NetrcParseError), err:
 333                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 334                                 return
 335
 336                 if username is None:
 337                         return
 338
 339                 # Log in
 340                 login_form = {
 341                                 'current_form': 'loginForm',
 342                                 'next':         '/',
 343                                 'action_login': 'Log In',
 344                                 'username':     username,
 345                                 'password':     password,
 346                                 }
 347                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 348                 try:
 349                         self.to_stdout('[youtube] Logging in')
 350                         login_results = urllib2.urlopen(request).read()
 351                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 352                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 353                                 return
 354                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 355                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 356                         return
 357
 358                 # Confirm age
 359                 age_form = {
 360                                 'next_url':             '/',
 361                                 'action_confirm':       'Confirm',
 362                                 }
 363                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 364                 try:
 365                         self.to_stdout('[youtube] Confirming age')
 366                         age_results = urllib2.urlopen(request).read()
 367                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 368                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 369
 370         def _real_extract(self, url):
 371                 # Extract video id from URL
 372                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 373                 if mobj is None:
 374                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 375                         return [None]
 376                 video_id = mobj.group(2)
 377
 378                 # Downloader parameters
 379                 format_param = None
 380                 if self._downloader is not None:
 381                         params = self._downloader.get_params()
 382                         format_param = params.get('format', None)
 383
 384                 # Extension
 385                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
 386
 387                 # Normalize URL, including format
 388                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 389                 if format_param is not None:
 390                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 391                 request = urllib2.Request(normalized_url, None, std_headers)
 392                 try:
 393                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 394                         video_webpage = urllib2.urlopen(request).read()
 395                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 396                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 397                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 398
 399                 # "t" param
 400                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 401                 if mobj is None:
 402                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 403                         return [None]
 404                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 405                 if format_param is not None:
 406                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 407                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
 408
 409                 # uploader
 410                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 411                 if mobj is None:
 412                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 413                         return [None]
 414                 video_uploader = mobj.group(1)
 415
 416                 # title
 417                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 418                 if mobj is None:
 419                         self.to_stderr('ERROR: Unable to extract video title')
 420                         return [None]
 421                 video_title = mobj.group(1).decode('utf-8')
 422                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 423
 424                 # simplified title
 425                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
 426                 simple_title = simple_title.strip(ur'_')
 427
 428                 # Return information
 429                 return [{
 430                         'id':           video_id,
 431                         'url':          video_real_url,
 432                         'uploader':     video_uploader,
 433                         'title':        video_title,
 434                         'stitle':       simple_title,
 435                         'ext':          video_extension,
 436                         }]
 437
 438 if __name__ == '__main__':
 439         try:
 440                 # General configuration
 441                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 442                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 443
 444                 # Information extractors
 445                 youtube_ie = YoutubeIE()
 446
 447                 # File downloader
 448                 fd = FileDownloader({
 449                         'usenetrc': False,
 450                         'username': None,
 451                         'password': None,
 452                         'quiet': False,
 453                         'simulate': False,
 454                         'format': None,
 455                         'outtmpl': '%(ext)s/%(ext)s/%(id)s.%(ext)s'
 456                         })
 457                 fd.add_info_extractor(youtube_ie)
 458                 fd.download([
 459                         'http://www.youtube.com/watch?v=t7qdwI7TVe8',
 460                         'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
 461                         'http://www.youtube.com/watch?v=DZRXe1wtC-M',
 462                         ])
 463
 464         except KeyboardInterrupt:
 465                 sys.exit('\nERROR: Interrupted by user')