]> git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl
b0b494ed680beaa13f8d065fc5e6b2b7e348e352
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
# HTTP headers attached to the urllib2 requests built in this file; the
# User-Agent value is a Firefox 3.0.1 identification string.
std_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) '
        'Gecko/2008070208 Firefox/3.0.1'
    ),
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': (
        'text/xml,application/xml,application/xhtml+xml,'
        'text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
    ),
    'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string holding the ASCII letters followed by the ASCII digits.
# Prepending u'' makes the concatenation yield a unicode object.
simple_title_chars = u'' + string.ascii_letters + string.digits
27
class DownloadError(Exception):
    """Signal that a download failed.

    FileDownloader objects raise this exception when they hit a problem
    and have not been configured to continue on errors. The exception
    carries the corresponding error message.
    """
36
class SameFileError(Exception):
    """Signal that several downloads would share one output file.

    FileDownloader objects raise this exception when they detect that
    more than one file would have to be written to the same name on disk.
    """
44
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Given a video URL, the downloader doesn't know how to
    extract all the needed information (that is the task of the
    InfoExtractors), so it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:       Username for authentication purposes.
    password:       Password for authentication purposes.
    usenetrc:       Use netrc for authentication instead.
    quiet:          Do not print messages to stdout.
    forceurl:       Force printing final URL.
    forcetitle:     Force printing title.
    simulate:       Do not download the video files.
    format:         Video format code.
    outtmpl:        Template for output names.
    ignoreerrors:   Do not stop on download errors.
    """

    _params = None
    # Class-level placeholder only: __init__ gives every instance its own list.
    _ies = []

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create the directory components of filename, like "mkdir -p".

        Only the directory part of filename is created; the last path
        component is taken to be the file itself. Works for relative and
        absolute paths alike. Raises OSError if a directory cannot be
        created.
        """
        directory = os.path.dirname(filename)
        # An empty dirname means the file lives in the current directory.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Format a byte count as a short string such as '1.50k'.

        bytes may be None (returns 'N/A'), a number, or a numeric string.
        The value is scaled by powers of 1024 and suffixed with one of
        the characters in 'bkMGTPEZY'.
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        amount = float(bytes)
        if amount < 1.0:
            # Covers zero (log undefined) and fractional rates, which must
            # not select a negative index into the suffix table.
            exponent = 0
        else:
            exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        Returns '---.-%' when the total length is unknown (None) or zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0.0:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        Returns '--:--' when the total is unknown, when nothing has been
        transferred yet, when less than a millisecond has elapsed, or
        when the estimate exceeds 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last read's size and duration.

        The result is the observed rate (bytes / elapsed_time) clamped to
        the range [bytes / 2, bytes * 2], never below 1 and never above
        4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Set the options dictionary. Raises ValueError if it is not a dict."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Return the options dictionary."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self._params.get('quiet', False):
            if skip_eol:
                terminator = ''
            else:
                terminator = '\n'
            sys.stdout.write('%s%s' % (message, terminator))
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def fixed_template(self):
        """Return True if the output template has no %(name)s fields."""
        return (re.search(u'(?u)%\\(.+?\\)s', self._params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        The message, if any, is printed to stderr. If the downloader has
        not been configured to ignore errors, DownloadError is raised;
        otherwise an error code is returned, suitable to be used later as
        the program exit code.
        """
        if message is not None:
            self.to_stderr(message)
        if not self._params.get('ignoreerrors', False):
            raise DownloadError(message)
        return 1

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout('[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                       (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_finish(self):
        """Report download finished."""
        self.to_stdout('')

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is handed to the first registered InfoExtractor that
        reports it as suitable. Returns 0 if everything went well, or the
        error code from trouble() if some problem was ignored. Raises
        SameFileError when several files would be written to one fixed
        output name, and DownloadError (via trouble()) when errors are
        not being ignored.
        """
        retcode = 0
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self._params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                all_results = ie.extract(url)
                results = [x for x in all_results if x is not None]
                if len(results) != len(all_results):
                    # The extractor marked at least one video as failed.
                    retcode = self.trouble()

                if len(results) > 1 and self.fixed_template():
                    raise SameFileError(self._params['outtmpl'])

                for result in results:
                    status = self._process_result(result)
                    if status != 0:
                        retcode = status
                # Only the first suitable extractor handles the URL.
                break
            if not suitable_found:
                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return retcode

    def _process_result(self, result):
        """Handle one extracted video: forced printings and the download.

        Returns 0 on success (or when simulating), otherwise the value
        returned by trouble().
        """
        # Forced printings
        if self._params.get('forcetitle', False):
            print(result['title'])
        if self._params.get('forceurl', False):
            print(result['url'])

        # Do nothing else if in simulate mode
        if self._params.get('simulate', False):
            return 0

        try:
            filename = self._params['outtmpl'] % result
            self.report_destination(filename)
        except (ValueError, KeyError) as err:
            return self.trouble('ERROR: invalid output template: %s' % str(err))
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            return self.trouble('ERROR: unable to create directories: %s' % str(err))
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            return self.trouble('ERROR: unable to open for writing: %s' % str(err))
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file on every path so the handle is not leaked.
                outstream.close()
        # Network errors are matched first: urllib2.URLError derives from
        # IOError and would otherwise be reported as a write failure.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            return self.trouble('ERROR: unable to download video data: %s' % str(err))
        except (OSError, IOError) as err:
            return self.trouble('ERROR: unable to write video data: %s' % str(err))
        except ValueError as err:
            # Raised by _do_download when fewer bytes than announced arrive.
            return self.trouble('ERROR: unable to download video data: %s' % str(err))
        return 0

    def _do_download(self, stream, url):
        """Fetch url and write its contents to stream, reporting progress.

        Raises ValueError if the server announced a Content-length and a
        different number of bytes was received. Errors from urllib2 and
        from writing to stream propagate to the caller.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # Parse the announced length once; a missing or malformed
            # header means the length is unknown.
            try:
                data_len = int(data.info().get('Content-length', None))
            except (TypeError, ValueError):
                data_len = None
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                # Progress message
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                # Download and write
                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                # Adapt the next read size to the speed just observed.
                block_size = self.best_block_size(after - before, data_block_len)
        finally:
            data.close()

        self.report_finish()
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
302
class InfoExtractor(object):
    """Information Extractor class.

    Given a URL, an information extractor obtains information about the
    video (or videos) the URL refers to: the real video URL, the video
    title and simplified title, the uploader and others. The extract()
    method returns that information as a list of dictionaries; a list is
    used because one URL can refer to several videos (think of
    playlists). Each dictionary must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    stitle:         Simplified title.
    ext:            Video filename extension.

    Subclasses should redefine the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    They should probably also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Store the optional downloader and mark the instance as not ready."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Tell whether this IE can handle url; the base class accepts all."""
        return True

    def initialize(self):
        """Run the real initialization (authentication, etc) only once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the extracted list of dicts."""
        self.initialize()
        extracted = self._real_extract(url)
        return extracted

    def set_downloader(self, downloader):
        """Remember the downloader in charge of this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout unless the downloader is in quiet mode."""
        downloader = self._downloader
        if downloader is not None and downloader.get_params().get('quiet', False):
            return
        print(message)

    def to_stderr(self, message):
        """Write message, followed by a newline, to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
371
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    def report_login(self):
        """Report attempt to log in."""
        self.to_stdout('[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_stdout('[youtube] Confirming age')

    def report_webpage_download(self, video_id):
        """Report attempt to download webpage."""
        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

    def report_video_url(self, video_id, video_real_url):
        """Report extracted video URL."""
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

    @staticmethod
    def _htmlentity_transform(matchobj):
        """Return the character for a matched HTML entity.

        Handles named entities as well as decimal (&#39;) and hexadecimal
        (&#x27;) character references. Text that cannot be decoded is
        returned unchanged instead of raising.
        """
        entity = matchobj.group(1)
        try:
            if entity.startswith(u'#'):
                number = entity[1:]
                if number[:1] in (u'x', u'X'):
                    return unichr(int(number[1:], 16))
                return unichr(int(number, 10))
            return unichr(htmlentitydefs.name2codepoint[entity])
        except (KeyError, ValueError, OverflowError):
            # Unknown name, malformed number or code point out of range.
            return matchobj.group(0)

    def _real_initialize(self):
        """Log in and confirm age, if credentials are available.

        Credentials come from the downloader's 'username'/'password'
        options or, when 'usenetrc' is set, from the .netrc entry for
        the 'youtube' machine. Every failure is reported on stderr and
        ends the initialization early; nothing is raised for them.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            # .get() so that a missing 'password' option is not a KeyError.
            password = downloader_params.get('password', None)
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next':         '/',
            'action_login': 'Log In',
            'username':     username,
            'password':     password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back is treated as a rejected login.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url':             '/',
            'action_confirm':       'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            # The response body is read but its contents are not inspected.
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the information of the video that url refers to.

        Returns a list with a single dictionary (keys: id, url, uploader,
        title, stitle, ext) on success, or [None] after printing an error
        message to stderr when any step fails.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.report_webpage_download(video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
            return [None]
        self.report_information_extraction(video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.report_video_url(video_id, video_real_url)

        # uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Decode HTML entities; only well-formed ones are matched, and the
        # ones that cannot be decoded are left as they are.
        video_title = re.sub(u'(?u)&(#?\\w+);', self._htmlentity_transform, video_title)
        # The path separator must not appear in a name used for a file.
        video_title = video_title.replace(os.sep, u'%')

        # simplified title
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id':           video_id,
            'url':          video_real_url,
            'uploader':     video_uploader,
            'title':        video_title,
            'stitle':       simple_title,
            'ext':          video_extension,
        }]
528
# Command line entry point: parse the options, configure a FileDownloader
# with a YoutubeIE and download every URL given as argument.
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # General configuration. A single opener carries both handlers;
        # installing a second opener would replace the first one.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300)  # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='INTERNAL',
            conflict_handler='resolve',
        )
        parser.add_option('-h', '--help',
                          action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                          action='version', help='print program version and exit')
        parser.add_option('-u', '--username',
                          dest='username', metavar='UN', help='account username')
        parser.add_option('-p', '--password',
                          dest='password', metavar='PW', help='account password')
        parser.add_option('-o', '--output',
                          dest='outtmpl', metavar='TPL', help='output filename template')
        parser.add_option('-q', '--quiet',
                          action='store_true', dest='quiet', help='activates quiet mode', default=False)
        parser.add_option('-s', '--simulate',
                          action='store_true', dest='simulate', help='do not download video', default=False)
        parser.add_option('-t', '--title',
                          action='store_true', dest='usetitle', help='use title in file name', default=False)
        parser.add_option('-l', '--literal',
                          action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        parser.add_option('-n', '--netrc',
                          action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option('-g', '--get-url',
                          action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        parser.add_option('-e', '--get-title',
                          action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option('-f', '--format',
                          dest='format', metavar='FMT', help='video format code')
        # -b and -m are aliases for -f, so they must store into the same
        # destination that is handed to the downloader below.
        parser.add_option('-b', '--best-quality',
                          action='store_const', dest='format', help='alias for -f 18', const='18')
        parser.add_option('-m', '--mobile-version',
                          action='store_const', dest='format', help='alias for -f 17', const='17')
        parser.add_option('-i', '--ignore-errors',
                          action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        (opts, args) = parser.parse_args()

        # Conflicting, missing and erroneous options
        if len(args) < 1:
            sys.exit('ERROR: you must provide at least one URL')
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            sys.exit('ERROR: using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            sys.exit('ERROR: account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            sys.exit('ERROR: using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            sys.exit('ERROR: using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass('Type account password and press return:')

        # Output template: an explicit non-empty template wins, then the
        # title-based ones, then the plain video id.
        if opts.outtmpl:
            outtmpl = opts.outtmpl
        elif opts.usetitle:
            outtmpl = '%(stitle)s-%(id)s.%(ext)s'
        elif opts.useliteral:
            outtmpl = '%(title)s-%(id)s.%(ext)s'
        else:
            outtmpl = '%(id)s.%(ext)s'

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            'outtmpl': outtmpl,
            'ignoreerrors': opts.ignoreerrors,
        })
        fd.add_info_extractor(youtube_ie)
        retcode = fd.download(args)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit('ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')