git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 # Author: Ricardo Garcia Gonzalez
   4 # License: Public domain code
   5 import htmlentitydefs
   6 import httplib
   7 import math
   8 import netrc
   9 import os
  10 import os.path
  11 import re
  12 import socket
  13 import string
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18
  19 std_headers = {
  20         'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
  21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  23         'Accept-Language': 'en-us,en;q=0.5',
  24 }
  25
  26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  27
  28 class FileDownloader(object):
  29         """File Downloader class.
  30
  31         File downloader objects are the ones responsible of downloading the
  32         actual video file and writing it to disk if the user has requested
  33         it, among some other tasks. In most cases there should be one per
  34         program. As, given a video URL, the downloader doesn't know how to
  35         extract all the needed information, task that InfoExtractors do, it
  36         has to pass the URL to one of them.
  37
  38         For this, file downloader objects have a method that allows
  39         InfoExtractors to be registered in a given order. When it is passed
  40         a URL, the file downloader handles it to the first InfoExtractor it
  41         finds that reports it's able to handle it. The InfoExtractor returns
  42         all the information to the FileDownloader and the latter downloads the
  43         file or does whatever it's instructed to do.
  44
  45         File downloaders accept a lot of parameters. In order not to saturate
  46         the object constructor with arguments, it receives a dictionary of
  47         options instead. These options are available through the get_params()
  48         method for the InfoExtractors to use. The FileDownloader also registers
  49         itself as the downloader in charge for the InfoExtractors that are
  50         added to it, so this is a "mutual registration".
  51
  52         Available options:
  53
  54         username:       Username for authentication purposes.
  55         password:       Password for authentication purposes.
  56         usenetrc:       Use netrc for authentication instead.
  57         quiet:          Do not print messages to stdout.
  58         format:         Video format code.
  59         outtmpl:        Template for output names.
  60         """
  61
  62         _params = None
  63         _ies = []
  64
  65         def __init__(self, params):
  66                 self._ies = []
  67                 self.set_params(params)
  68
  69         @staticmethod
  70         def pmkdir(filename):
  71                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
  72                 components = filename.split(os.sep)
  73                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  74                 for dir in aggregate:
  75                         if not os.path.exists(dir):
  76                                 os.mkdir(dir)
  77
  78         @staticmethod
  79         def format_bytes(bytes):
  80                 if bytes is None:
  81                         return 'N/A'
  82                 if bytes == 0:
  83                         exponent = 0
  84                 else:
  85                         exponent = long(math.log(float(bytes), 1024.0))
  86                 suffix = 'bkMGTPEZY'[exponent]
  87                 if exponent == 0:
  88                         return '%s%s' % (bytes, suffix)
  89                 converted = float(bytes) / float(1024**exponent)
  90                 return '%.2f%s' % (converted, suffix)
  91
  92         @staticmethod
  93         def calc_percent(byte_counter, data_len):
  94                 if data_len is None:
  95                         return '---.-%'
  96                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  97
  98         @staticmethod
  99         def calc_eta(start, now, total, current):
 100                 if total is None:
 101                         return '--:--'
 102                 dif = now - start
 103                 if current == 0 or dif < 0.001: # One millisecond
 104                         return '--:--'
 105                 rate = float(current) / dif
 106                 eta = long((float(total) - float(current)) / rate)
 107                 (eta_mins, eta_secs) = divmod(eta, 60)
 108                 if eta_mins > 99:
 109                         return '--:--'
 110                 return '%02d:%02d' % (eta_mins, eta_secs)
 111
 112         @staticmethod
 113         def calc_speed(start, now, bytes):
 114                 dif = now - start
 115                 if bytes == 0 or dif < 0.001: # One millisecond
 116                         return '%9s' % 'N/A b/s'
 117                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
 118
 119         @staticmethod
 120         def best_block_size(elapsed_time, bytes):
 121                 new_min = max(bytes / 2.0, 1.0)
 122                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
 123                 if elapsed_time < 0.001:
 124                         return int(new_max)
 125                 rate = bytes / elapsed_time
 126                 if rate > new_max:
 127                         return int(new_max)
 128                 if rate < new_min:
 129                         return int(new_min)
 130                 return int(rate)
 131
 132         def set_params(self, params):
 133                 """Sets parameters."""
 134                 if type(params) != dict:
 135                         raise ValueError('params: dictionary expected')
 136                 self._params = params
 137
 138         def get_params(self):
 139                 """Get parameters."""
 140                 return self._params
 141
 142         def add_info_extractor(self, ie):
 143                 """Add an InfoExtractor object to the end of the list."""
 144                 self._ies.append(ie)
 145                 ie.set_downloader(self)
 146
 147         def download(self, url_list):
 148                 """Download a given list of URLs."""
 149                 for url in url_list:
 150                         suitable_found = False
 151                         for ie in self._ies:
 152                                 if not ie.suitable(url):
 153                                         continue
 154                                 # Suitable InfoExtractor found
 155                                 suitable_found = True
 156                                 for result in ie.extract(url):
 157                                         if result is None:
 158                                                 continue
 159                                         try:
 160                                                 filename = self._params['outtmpl'] % result
 161                                         except (KeyError), err:
 162                                                 sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
 163                                                 continue
 164                                         try:
 165                                                 self.pmkdir(filename)
 166                                         except (OSError, IOError), err:
 167                                                 sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
 168                                                 continue
 169                                         try:
 170                                                 outstream = open(filename, 'wb')
 171                                         except (OSError, IOError), err:
 172                                                 sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
 173                                                 continue
 174                                         try:
 175                                                 self._do_download(outstream, result['url'])
 176                                                 outstream.close()
 177                                         except (OSError, IOError), err:
 178                                                 sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
 179                                                 continue
 180                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 181                                                 sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
 182                                                 continue
 183                                 break
 184                         if not suitable_found:
 185                                 sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)
 186
 187         def _do_download(self, stream, url):
 188                 request = urllib2.Request(url, None, std_headers)
 189                 data = urllib2.urlopen(request)
 190                 data_len = data.info().get('Content-length', None)
 191                 data_len_str = self.format_bytes(data_len)
 192                 byte_counter = 0
 193                 block_size = 1024
 194                 start = time.time()
 195                 while True:
 196                         percent_str = self.calc_percent(byte_counter, data_len)
 197                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
 198                         speed_str = self.calc_speed(start, time.time(), byte_counter)
 199
 200                         if not self._params.get('quiet', False):
 201                                 sys.stdout.write('\r[download] %s of %s at %s ETA %s' %
 202                                                 (percent_str, data_len_str, speed_str, eta_str))
 203                                 sys.stdout.flush()
 204
 205                         before = time.time()
 206                         data_block = data.read(block_size)
 207                         after = time.time()
 208                         data_block_len = len(data_block)
 209                         if data_block_len == 0:
 210                                 break
 211                         byte_counter += data_block_len
 212                         stream.write(data_block)
 213                         block_size = self.best_block_size(after - before, data_block_len)
 214
 215                 if not self._params.get('quiet', False):
 216                         print
 217
 218                 if data_len is not None and str(byte_counter) != data_len:
 219                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
 220
 221 class InfoExtractor(object):
 222         """Information Extractor class.
 223
 224         Information extractors are the classes that, given a URL, extract
 225         information from the video (or videos) the URL refers to. This
 226         information includes the real video URL, the video title and simplified
 227         title, author and others. It is returned in a list of dictionaries when
 228         calling its extract() method. It is a list because a URL can refer to
 229         more than one video (think of playlists). The dictionaries must include
 230         the following fields:
 231
 232         id:             Video identifier.
 233         url:            Final video URL.
 234         uploader:       Nickname of the video uploader.
 235         title:          Literal title.
 236         stitle:         Simplified title.
 237         ext:            Video filename extension.
 238
 239         Subclasses of this one should re-define the _real_initialize() and
 240         _real_extract() methods, as well as the suitable() static method.
 241         Probably, they should also be instantiated and added to the main
 242         downloader.
 243         """
 244
 245         _ready = False
 246         _downloader = None
 247
 248         def __init__(self, downloader=None):
 249                 """Constructor. Receives an optional downloader."""
 250                 self._ready = False
 251                 self.set_downloader(downloader)
 252
 253         @staticmethod
 254         def suitable(url):
 255                 """Receives a URL and returns True if suitable for this IE."""
 256                 return True
 257
 258         def initialize(self):
 259                 """Initializes an instance (login, etc)."""
 260                 if not self._ready:
 261                         self._real_initialize()
 262                         self._ready = True
 263
 264         def extract(self, url):
 265                 """Extracts URL information and returns it in list of dicts."""
 266                 self.initialize()
 267                 return self._real_extract(url)
 268
 269         def set_downloader(self, downloader):
 270                 """Sets the downloader for this IE."""
 271                 self._downloader = downloader
 272
 273         def to_stdout(self, message):
 274                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
 275                         print message
 276
 277         def to_stderr(self, message):
 278                 sys.stderr.write('%s\n' % message)
 279
 280         def _real_initialize(self):
 281                 """Real initialization process. Redefine in subclasses."""
 282                 pass
 283
 284         def _real_extract(self, url):
 285                 """Real extraction process. Redefine in subclasses."""
 286                 pass
 287
 288 class YoutubeIE(InfoExtractor):
 289         """Information extractor for youtube.com."""
 290
 291         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
 292         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
 293         _NETRC_MACHINE = 'youtube'
 294
 295         def _real_initialize(self):
 296                 if self._downloader is None:
 297                         return
 298
 299                 username = None
 300                 password = None
 301                 downloader_params = self._downloader.get_params()
 302
 303                 # Attempt to use provided username and password or .netrc data
 304                 if downloader_params.get('username', None) is not None:
 305                         username = downloader_params['username']
 306                         password = downloader_params['password']
 307                 elif downloader_params.get('usenetrc', False):
 308                         try:
 309                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 310                                 if info is not None:
 311                                         username = info[0]
 312                                         password = info[2]
 313                                 else:
 314                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 315                         except (IOError, netrc.NetrcParseError), err:
 316                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
 317                                 return
 318
 319                 if username is None:
 320                         return
 321
 322                 # Log in
 323                 login_form = {  'current_form': 'loginForm',
 324                                 'next':         '/',
 325                                 'action_login': 'Log In',
 326                                 'username':     username,
 327                                 'password':     password,       }
 328                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
 329                 try:
 330                         self.to_stdout('[youtube] Logging in')
 331                         login_results = urllib2.urlopen(request).read()
 332                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
 333                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
 334                                 return
 335                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 336                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
 337                         return
 338
 339                 # Confirm age
 340                 age_form = {    'next_url':             '/',
 341                                 'action_confirm':       'Confirm',      }
 342                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
 343                 try:
 344                         self.to_stdout('[youtube] Confirming age')
 345                         age_results = urllib2.urlopen(request).read()
 346                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 347                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
 348
 349         def _real_extract(self, url):
 350                 # Extract video id from URL
 351                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
 352                 if mobj is None:
 353                         self.to_stderr('ERROR: Invalid URL: %s' % url)
 354                         return [None]
 355                 video_id = mobj.group(2)
 356
 357                 # Downloader parameters
 358                 format_param = None
 359                 if self._downloader is not None:
 360                         params = self._downloader.get_params()
 361                         format_param = params.get('format', None)
 362
 363                 # Extension
 364                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
 365
 366                 # Normalize URL, including format
 367                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
 368                 if format_param is not None:
 369                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
 370                 request = urllib2.Request(normalized_url, None, std_headers)
 371                 try:
 372                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
 373                         video_webpage = urllib2.urlopen(request).read()
 374                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 375                         sys.exit('ERROR: Unable to download video: %s' % str(err))
 376                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
 377
 378                 # "t" param
 379                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
 380                 if mobj is None:
 381                         self.to_stderr('ERROR: Unable to extract "t" parameter')
 382                         return [None]
 383                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
 384                 if format_param is not None:
 385                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
 386
 387                 # uploader
 388                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
 389                 if mobj is None:
 390                         self.to_stderr('ERROR: Unable to extract uploader nickname')
 391                         return [None]
 392                 video_uploader = mobj.group(1)
 393
 394                 # title
 395                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
 396                 if mobj is None:
 397                         self.to_stderr('ERROR: Unable to extract video title')
 398                         return [None]
 399                 video_title = mobj.group(1).decode('utf-8')
 400                 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
 401
 402                 # simplified title
 403                 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
 404                 simple_title = simple_title.strip(u'_')
 405
 406                 # Return information
 407                 return [{       'id':           video_id,
 408                                 'url':          video_real_url,
 409                                 'uploader':     video_uploader,
 410                                 'title':        video_title,
 411                                 'stitle':       simple_title,
 412                                 'ext':          video_extension,
 413                                 }]
 414
 415 if __name__ == '__main__':
 416         try:
 417                 # General configuration
 418                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
 419                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
 420
 421                 # Information extractors
 422                 youtube_ie = YoutubeIE()
 423
 424                 # File downloader
 425                 fd = FileDownloader({   'usenetrc': False,
 426                                         'username': None,
 427                                         'password': None,
 428                                         'quiet': False,
 429                                         'format': None,
 430                                         'outtmpl': '%(id)s.%(ext)s'
 431                                         })
 432                 fd.add_info_extractor(youtube_ie)
 433                 fd.download([   'http://www.youtube.com/watch?v=t7qdwI7TVe8',
 434                                 'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
 435                                 'http://www.youtube.com/watch?v=DZRXe1wtC-M',   ])
 436
 437         except KeyboardInterrupt:
 438                 sys.exit('\nERROR: Interrupted by user')