2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class FileDownloader(object):
29 """File Downloader class.
31 File downloader objects are the ones responsible of downloading the
32 actual video file and writing it to disk if the user has requested
33 it, among some other tasks. In most cases there should be one per
34 program. As, given a video URL, the downloader doesn't know how to
35 extract all the needed information, task that InfoExtractors do, it
36 has to pass the URL to one of them.
38 For this, file downloader objects have a method that allows
39 InfoExtractors to be registered in a given order. When it is passed
40 a URL, the file downloader handles it to the first InfoExtractor it
41 finds that reports being able to handle it. The InfoExtractor returns
42 all the information to the FileDownloader and the latter downloads the
43 file or does whatever it's instructed to do.
45 File downloaders accept a lot of parameters. In order not to saturate
46 the object constructor with arguments, it receives a dictionary of
47 options instead. These options are available through the get_params()
48 method for the InfoExtractors to use. The FileDownloader also registers
49 itself as the downloader in charge for the InfoExtractors that are
50 added to it, so this is a "mutual registration".
54 username: Username for authentication purposes.
55 password: Password for authentication purposes.
56 usenetrc: Use netrc for authentication instead.
57 quiet: Do not print messages to stdout.
58 forceurl: Force printing final URL.
59 forcetitle: Force printing title.
60 simulate: Do not download the video files.
61 format: Video format code.
62 outtmpl: Template for output names.
63 ignoreerrors: Do not stop on download errors.
69 def __init__(self, params):
70 """Create a FileDownloader object with the given options."""
72 self.set_params(params)
76 """Create directory components in filename. Similar to Unix "mkdir -p"."""
77 components = filename.split(os.sep)
78 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
80 if not os.path.exists(dir):
84 def format_bytes(bytes):
90 exponent = long(math.log(float(bytes), 1024.0))
91 suffix = 'bkMGTPEZY'[exponent]
92 converted = float(bytes) / float(1024**exponent)
93 return '%.2f%s' % (converted, suffix)
96 def calc_percent(byte_counter, data_len):
99 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
102 def calc_eta(start, now, total, current):
106 if current == 0 or dif < 0.001: # One millisecond
108 rate = float(current) / dif
109 eta = long((float(total) - float(current)) / rate)
110 (eta_mins, eta_secs) = divmod(eta, 60)
113 return '%02d:%02d' % (eta_mins, eta_secs)
116 def calc_speed(start, now, bytes):
118 if bytes == 0 or dif < 0.001: # One millisecond
119 return '%10s' % '---b/s'
120 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
123 def best_block_size(elapsed_time, bytes):
124 new_min = max(bytes / 2.0, 1.0)
125 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
126 if elapsed_time < 0.001:
128 rate = bytes / elapsed_time
def set_params(self, params):
    """Store the option dictionary used by this downloader.

    Arguments:
    params -- dictionary of options; it is kept by reference, not copied.

    Raises ValueError if params is not a dictionary.
    """
    # isinstance() also admits dict subclasses, which the former exact
    # type comparison rejected although they behave like any dictionary.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
141 def get_params(self):
142 """Get parameters."""
145 def add_info_extractor(self, ie):
146 """Add an InfoExtractor object to the end of the list."""
148 ie.set_downloader(self)
def to_stdout(self, message, skip_eol=False):
    """Write message to standard output unless the 'quiet' option is set.

    A newline is appended to the message, except when skip_eol is true.
    """
    quiet = self._params.get('quiet', False)
    if not quiet:
        # Indexed by the flag itself: False (0) -> newline, True (1) -> nothing.
        terminators = ['\n', '']
        ending = terminators[skip_eol]
        sys.stdout.write('%s%s' % (message, ending))
def to_stderr(self, message):
    """Write message, followed by a newline, to standard error."""
    line = '%s\n' % message
    sys.stderr.write(line)
def fixed_template(self):
    """Report whether the output template has no %(field)s placeholder.

    Reads self._params['outtmpl'] and returns True when no "%(...)s"
    field occurs in it, False otherwise.
    """
    # Plain raw string instead of ur'...': the pattern is pure ASCII, so it
    # matches the same text, and the ur prefix is a syntax error on Python 3.
    placeholder = re.search(r'(?u)%\(.+?\)s', self._params['outtmpl'])
    return placeholder is None
164 def trouble(self, message=None):
165 """Determine action to take when a download problem appears.
167 Depending on if the downloader has been configured to ignore
168 download errors or not, this method may exit the program or
169 not when errors are found, after printing the message. If it
170 doesn't exit, it returns an error code suitable to be returned
171 later as a program exit code to indicate error.
173 if message is not None:
174 self.to_stderr(message)
175 if not self._params.get('ignoreerrors', False):
179 def download(self, url_list):
180 """Download a given list of URLs."""
182 if len(url_list) > 1 and self.fixed_template():
183 sys.exit('ERROR: fixed output name but more than one file to download')
186 suitable_found = False
188 if not ie.suitable(url):
190 # Suitable InfoExtractor found
191 suitable_found = True
192 all_results = ie.extract(url)
193 results = [x for x in all_results if x is not None]
194 if len(results) != len(all_results):
195 retcode = self.trouble()
197 if len(results) > 1 and self.fixed_template():
198 sys.exit('ERROR: fixed output name but more than one file to download')
200 for result in results:
203 if self._params.get('forcetitle', False):
204 print result['title']
205 if self._params.get('forceurl', False):
208 # Do nothing else if in simulate mode
209 if self._params.get('simulate', False):
213 filename = self._params['outtmpl'] % result
214 except (ValueError, KeyError), err:
215 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
218 self.pmkdir(filename)
219 except (OSError, IOError), err:
220 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
223 outstream = open(filename, 'wb')
224 except (OSError, IOError), err:
225 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
228 self._do_download(outstream, result['url'])
230 except (OSError, IOError), err:
231 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
233 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
234 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
237 if not suitable_found:
238 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
242 def _do_download(self, stream, url):
243 request = urllib2.Request(url, None, std_headers)
244 data = urllib2.urlopen(request)
245 data_len = data.info().get('Content-length', None)
246 data_len_str = self.format_bytes(data_len)
251 percent_str = self.calc_percent(byte_counter, data_len)
252 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
253 speed_str = self.calc_speed(start, time.time(), byte_counter)
254 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
255 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
258 data_block = data.read(block_size)
260 data_block_len = len(data_block)
261 if data_block_len == 0:
263 byte_counter += data_block_len
264 stream.write(data_block)
265 block_size = self.best_block_size(after - before, data_block_len)
268 if data_len is not None and str(byte_counter) != data_len:
269 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
271 class InfoExtractor(object):
272 """Information Extractor class.
274 Information extractors are the classes that, given a URL, extract
275 information from the video (or videos) the URL refers to. This
276 information includes the real video URL, the video title and simplified
277 title, author and others. It is returned in a list of dictionaries when
278 calling its extract() method. It is a list because a URL can refer to
279 more than one video (think of playlists). The dictionaries must include
280 the following fields:
282 id: Video identifier.
283 url: Final video URL.
284 uploader: Nickname of the video uploader.
285 title: Literal title.
286 stitle: Simplified title.
287 ext: Video filename extension.
289 Subclasses of this one should re-define the _real_initialize() and
290 _real_extract() methods, as well as the suitable() static method.
291 Probably, they should also be instantiated and added to the main
298 def __init__(self, downloader=None):
299 """Constructor. Receives an optional downloader."""
301 self.set_downloader(downloader)
305 """Receives a URL and returns True if suitable for this IE."""
308 def initialize(self):
309 """Initializes an instance (authentication, etc)."""
311 self._real_initialize()
314 def extract(self, url):
315 """Extracts URL information and returns it in list of dicts."""
317 return self._real_extract(url)
def set_downloader(self, downloader):
    """Record the downloader associated with this IE.

    The value is stored as given in self._downloader; None is accepted.
    """
    self._downloader = downloader
323 def to_stdout(self, message):
324 """Print message to stdout if downloader is not in quiet mode."""
325 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Emit message on standard error, terminated by a newline."""
    stream = sys.stderr
    stream.write('%s\n' % message)
332 def _real_initialize(self):
333 """Real initialization process. Redefine in subclasses."""
336 def _real_extract(self, url):
337 """Real extraction process. Redefine in subclasses."""
340 class YoutubeIE(InfoExtractor):
341 """Information extractor for youtube.com."""
343 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
344 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
345 _NETRC_MACHINE = 'youtube'
347 def _real_initialize(self):
348 if self._downloader is None:
353 downloader_params = self._downloader.get_params()
355 # Attempt to use provided username and password or .netrc data
356 if downloader_params.get('username', None) is not None:
357 username = downloader_params['username']
358 password = downloader_params['password']
359 elif downloader_params.get('usenetrc', False):
361 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
366 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
367 except (IOError, netrc.NetrcParseError), err:
368 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
376 'current_form': 'loginForm',
378 'action_login': 'Log In',
379 'username': username,
380 'password': password,
382 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
384 self.to_stdout('[youtube] Logging in')
385 login_results = urllib2.urlopen(request).read()
386 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
387 self.to_stderr('WARNING: unable to log in: bad username or password')
389 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
390 self.to_stderr('WARNING: unable to log in: %s' % str(err))
396 'action_confirm': 'Confirm',
398 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
400 self.to_stdout('[youtube] Confirming age')
401 age_results = urllib2.urlopen(request).read()
402 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
403 sys.exit('ERROR: unable to confirm age: %s' % str(err))
405 def _real_extract(self, url):
406 # Extract video id from URL
407 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
409 self.to_stderr('ERROR: invalid URL: %s' % url)
411 video_id = mobj.group(2)
413 # Downloader parameters
415 if self._downloader is not None:
416 params = self._downloader.get_params()
417 format_param = params.get('format', None)
420 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
422 # Normalize URL, including format
423 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
424 if format_param is not None:
425 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
426 request = urllib2.Request(normalized_url, None, std_headers)
428 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
429 video_webpage = urllib2.urlopen(request).read()
430 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
431 sys.exit('ERROR: unable to download video: %s' % str(err))
432 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
435 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
437 self.to_stderr('ERROR: unable to extract "t" parameter')
439 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
440 if format_param is not None:
441 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
442 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
445 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
447 self.to_stderr('ERROR: unable to extract uploader nickname')
449 video_uploader = mobj.group(1)
452 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
454 self.to_stderr('ERROR: unable to extract video title')
456 video_title = mobj.group(1).decode('utf-8')
457 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
458 video_title = video_title.replace(os.sep, u'%')
461 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
462 simple_title = simple_title.strip(ur'_')
467 'url': video_real_url,
468 'uploader': video_uploader,
469 'title': video_title,
470 'stitle': simple_title,
471 'ext': video_extension,
474 if __name__ == '__main__':
476 # Modules needed only when running the main program
# General configuration
# A single opener carries both handlers: install_opener() replaces the
# process-wide opener, so a second call would discard the one installed
# by the first call instead of adding to it.
urllib2.install_opener(urllib2.build_opener(
    urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
486 parser = optparse.OptionParser(
487 usage='Usage: %prog [options] url...',
489 conflict_handler='resolve',
491 parser.add_option('-h', '--help',
492 action='help', help='print this help text and exit')
493 parser.add_option('-v', '--version',
494 action='version', help='print program version and exit')
495 parser.add_option('-u', '--username',
496 dest='username', metavar='UN', help='account username')
497 parser.add_option('-p', '--password',
498 dest='password', metavar='PW', help='account password')
499 parser.add_option('-o', '--output',
500 dest='outtmpl', metavar='TPL', help='output filename template')
501 parser.add_option('-q', '--quiet',
502 action='store_true', dest='quiet', help='activates quiet mode', default=False)
503 parser.add_option('-s', '--simulate',
504 action='store_true', dest='simulate', help='do not download video', default=False)
505 parser.add_option('-t', '--title',
506 action='store_true', dest='usetitle', help='use title in file name', default=False)
507 parser.add_option('-l', '--literal',
508 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
509 parser.add_option('-n', '--netrc',
510 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
511 parser.add_option('-g', '--get-url',
512 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
513 parser.add_option('-e', '--get-title',
514 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
515 parser.add_option('-f', '--format',
516 dest='format', metavar='FMT', help='video format code')
# -b is documented as an alias for -f 18, so it has to store its constant
# into the same destination ('format') that -f fills and that is read
# back as opts.format when the downloader options are built.
parser.add_option('-b', '--best-quality',
    action='store_const', dest='format', help='alias for -f 18', const='18')
519 parser.add_option('-i', '--ignore-errors',
520 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
521 (opts, args) = parser.parse_args()
523 # Conflicting, missing and erroneous options
525 sys.exit('ERROR: you must provide at least one URL')
526 if opts.usenetrc and (opts.username is not None or opts.password is not None):
527 sys.exit('ERROR: using .netrc conflicts with giving username/password')
528 if opts.password is not None and opts.username is None:
529 sys.exit('ERROR: account username missing')
530 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
531 sys.exit('ERROR: using output template conflicts with using title or literal title')
532 if opts.usetitle and opts.useliteral:
533 sys.exit('ERROR: using title conflicts with using literal title')
534 if opts.username is not None and opts.password is None:
535 opts.password = getpass.getpass('Type account password and press return:')
537 # Information extractors
538 youtube_ie = YoutubeIE()
541 fd = FileDownloader({
542 'usenetrc': opts.usenetrc,
543 'username': opts.username,
544 'password': opts.password,
545 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
546 'forceurl': opts.geturl,
547 'forcetitle': opts.gettitle,
548 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
549 'format': opts.format,
550 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
551 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
552 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
553 or '%(id)s.%(ext)s'),
554 'ignoreerrors': opts.ignoreerrors,
556 fd.add_info_extractor(youtube_ie)
557 retcode = fd.download(args)
560 except KeyboardInterrupt:
561 sys.exit('\nERROR: Interrupted by user')