2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# NOTE(review): rebuilt from a line-numbered listing that had lost its
# indentation and some interior lines; values marked "assumed" were not
# visible in that listing and should be confirmed against the real file.
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    """

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # Registered InfoExtractors, in the order they were added.
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        The last component is taken to be the file itself and is not created.
        Raises OSError if a directory cannot be created.
        """
        components = filename.split(os.sep)
        for count in range(1, len(components)):
            prefix = os.sep.join(components[:count])
            # An absolute path produces an empty first prefix (the text before
            # the leading separator); os.mkdir('') would fail, so skip it.
            if prefix and not os.path.exists(prefix):
                os.mkdir(prefix)

    @staticmethod
    def format_bytes(bytes):
        """Return a byte count as a short string such as '1.50M'.

        Accepts a number, a numeric string, or None (unknown size).
        """
        if bytes is None:
            return 'N/A'  # assumed placeholder text
        suffixes = 'bkMGTPEZY'
        amount = float(bytes)
        if amount > 0.0:
            exponent = int(math.log(amount, 1024.0))
        else:
            exponent = 0
        # Keep the index inside the suffix table: tiny fractions give a
        # negative exponent and huge values run past the last suffix.
        exponent = min(max(exponent, 0), len(suffixes) - 1)
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string."""
        # Unknown or zero total: no meaningful percentage (and no division).
        if data_len is None or float(data_len) == 0.0:
            return '---.-%'  # assumed placeholder text
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS', or a placeholder."""
        unknown = '--:--'  # assumed placeholder text
        if total is None:
            return unknown
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return unknown
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:  # assumed cutoff: does not fit the MM:SS field
            return unknown
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the next read size given how long the last block took.

        The result tracks the observed rate but stays between half and
        double the last block size, and never exceeds 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        return int(min(max(rate, new_min), new_max))

    def set_params(self, params):
        """Sets parameters. Raises ValueError unless params is a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if self._params.get('quiet', False):
            return
        if skip_eol:
            terminator = ''
        else:
            terminator = '\n'
        sys.stdout.write('%s%s' % (message, terminator))
        # Progress lines end without a newline, so push them out explicitly.
        sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(r'(?u)%\(.+?\)s', self._params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may exit the program or
        not when errors are found, after printing the message. If it
        doesn't exit, it returns an error code suitable to be returned
        later as a program exit code to indicate error.
        """
        if message is not None:
            self.to_stderr(message)
        if not self._params.get('ignoreerrors', False):
            sys.exit(1)
        return 1

    def download(self, url_list):
        """Download a given list of URLs.

        Returns 0 when everything succeeded, otherwise the code reported by
        trouble(). Exits the program when a fixed output name would be
        shared by more than one file.
        """
        retcode = 0
        if len(url_list) > 1 and self.fixed_template():
            sys.exit('ERROR: fixed output name but more than one file to download')

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                all_results = ie.extract(url)
                results = [x for x in all_results if x is not None]
                # Extractors report a failed item as None.
                if len(results) != len(all_results):
                    retcode = self.trouble()

                if len(results) > 1 and self.fixed_template():
                    sys.exit('ERROR: fixed output name but more than one file to download')

                for result in results:
                    code = self._process_result(result)
                    if code is not None:
                        retcode = code
                # Only the first suitable extractor handles a URL.
                break
            if not suitable_found:
                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return retcode

    def _process_result(self, result):
        """Print and/or save one extracted result; return an error code or None."""
        # Forced printings
        if self._params.get('forcetitle', False):
            print(result['title'])
        if self._params.get('forceurl', False):
            print(result['url'])

        # Do nothing else if in simulate mode
        if self._params.get('simulate', False):
            return None

        # Exceptions are read through sys.exc_info() so that the syntax is
        # accepted by every Python version.
        try:
            filename = self._params['outtmpl'] % result
        except (ValueError, KeyError):
            err = sys.exc_info()[1]
            return self.trouble('ERROR: invalid output template: %s' % str(err))
        try:
            self.pmkdir(filename)
        except (OSError, IOError):
            err = sys.exc_info()[1]
            return self.trouble('ERROR: unable to create directories: %s' % str(err))
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError):
            err = sys.exc_info()[1]
            return self.trouble('ERROR: unable to open for writing: %s' % str(err))
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file whether or not the transfer succeeded.
                outstream.close()
        # Network errors come first: urllib2.URLError derives from IOError and
        # would otherwise be reported as a write failure. ValueError covers
        # truncated content and malformed URLs or length headers.
        except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError):
            err = sys.exc_info()[1]
            return self.trouble('ERROR: unable to download video data: %s' % str(err))
        except (OSError, IOError):
            err = sys.exc_info()[1]
            return self.trouble('ERROR: unable to write video data: %s' % str(err))
        return None

    def _do_download(self, stream, url):
        """Copy the resource at url into stream, reporting progress on stdout.

        Raises ValueError when the number of bytes received differs from the
        Content-length announced by the server.
        """
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Header values are text; compare sizes as numbers.
            data_len = int(data_len)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024  # assumed initial read size
        start = time.time()
        while True:
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                    (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len
            stream.write(data_block)
            block_size = self.best_block_size(after - before, data_block_len)

        # Finish the progress line.
        self.to_stdout('')
        if data_len is not None and byte_counter != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
# NOTE(review): rebuilt from a line-numbered listing that had lost its
# indentation and some interior lines; confirm the parts marked "assumed".
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Set once _real_initialize() has run, so it runs only once.
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        # assumed default: YoutubeIE does not override suitable() and
        # FileDownloader.download() only uses extractors that return true.
        return True

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout if downloader is not in quiet mode."""
        downloader = self._downloader
        if downloader is not None and downloader.get_params().get('quiet', False):
            return
        print(message)

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
# NOTE(review): rebuilt from a line-numbered listing that had lost its
# indentation and some interior lines; values marked "assumed" were not
# visible in that listing and should be confirmed against the real file.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    def _real_initialize(self):
        """Log in and confirm age when credentials are available.

        Credentials come from the downloader parameters or, if requested,
        from .netrc. Login problems only produce a warning; a failed age
        confirmation request exits the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                # authenticators() returns (login, account, password).
                username = info[0]
                password = info[2]
            except (IOError, netrc.NetrcParseError):
                err = sys.exc_info()[1]
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',  # assumed field; mirrors the query in _LOGIN_URL
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Being served the login form again means the login was rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self.to_stderr('WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',  # assumed field; mirrors the query in _AGE_URL
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            sys.exit('ERROR: unable to confirm age: %s' % str(err))

    @staticmethod
    def _decode_entity(mobj):
        """Return the character for one matched '&...;' span, or the span itself.

        Handles named entities plus decimal (&#39;) and hexadecimal (&#x27;)
        references. Anything unrecognised is left exactly as it was.
        """
        entity = mobj.group(1)
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])
        numeric = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)$', entity)
        if numeric is not None:
            digits = numeric.group(1)
            try:
                if digits.startswith(u'x'):
                    return unichr(int(digits[1:], 16))
                return unichr(int(digits, 10))
            except (ValueError, OverflowError):
                # Code point outside the range unichr() accepts.
                pass
        return mobj.group(0)

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        The element is a dictionary with the fields id, url, uploader, title,
        stitle and ext, or None when the information could not be extracted.
        Exits the program if the video page cannot be downloaded.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {'18': 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            sys.exit('ERROR: unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
        self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

        # Uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # Title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        video_title = re.sub(u'(?u)&(.+?);', self._decode_entity, video_title)
        # The title may become part of a file name; keep separators out of it.
        video_title = video_title.replace(os.sep, u'%')

        # Simplified title
        simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
# NOTE(review): rebuilt from a line-numbered listing that had lost its
# indentation and some interior lines; values marked "assumed" were not
# visible in that listing and should be confirmed against the real file.
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # General configuration
        # A single opener carries both handlers; installing two openers in a
        # row would leave only the last one active.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='INTERNAL',  # assumed value; the original line was not visible
            conflict_handler='resolve',
        )
        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-u', '--username',
                dest='username', metavar='UN', help='account username')
        parser.add_option('-p', '--password',
                dest='password', metavar='PW', help='account password')
        parser.add_option('-o', '--output',
                dest='outtmpl', metavar='TPL', help='output filename template')
        parser.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        parser.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        parser.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        parser.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        parser.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        parser.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option('-f', '--format',
                dest='format', metavar='FMT', help='video format code')
        # Stored in the same destination as -f so that the alias takes effect;
        # only opts.format is read below.
        parser.add_option('-b', '--best-quality',
                action='store_const', dest='format', help='alias for -f 18', const='18')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        (opts, args) = parser.parse_args()

        # Conflicting, missing and erroneous options
        if len(args) < 1:
            sys.exit('ERROR: you must provide at least one URL')
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            sys.exit('ERROR: using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            sys.exit('ERROR: account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            sys.exit('ERROR: using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            sys.exit('ERROR: using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass('Type account password and press return:')

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        # --get-url and --get-title imply both quiet and simulate modes.
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            # First applicable template wins: explicit template, simplified
            # title, literal title, then the bare video id.
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
                or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
                or '%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
        })
        fd.add_info_extractor(youtube_ie)
        retcode = fd.download(args)
        sys.exit(retcode)

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')