2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
# Characters kept in simplified titles: ASCII letters followed by digits.
# The encode/decode round-trip produces text (unicode on Python 2, str on
# Python 3) without calling str.decode(), which does not exist on Python 3.
simple_title_chars = (string.ascii_letters + string.digits).encode('ascii').decode('ascii')
28 class DownloadError(Exception):
29 """Download Error exception.
31 This exception may be thrown by FileDownloader objects if they are not
32 configured to continue on errors. They will contain the appropriate
37 class SameFileError(Exception):
38 """Same File exception.
40 This exception will be thrown by FileDownloader objects if they detect
41 multiple files would have to be downloaded to the same file on disk.
45 class FileDownloader(object):
46 """File Downloader class.
48 File downloader objects are the ones responsible for downloading the
49 actual video file and writing it to disk if the user has requested
50 it, among some other tasks. In most cases there should be one per
51 program. As, given a video URL, the downloader doesn't know how to
52 extract all the needed information, task that InfoExtractors do, it
53 has to pass the URL to one of them.
55 For this, file downloader objects have a method that allows
56 InfoExtractors to be registered in a given order. When it is passed
57 a URL, the file downloader hands it to the first InfoExtractor it
58 finds that reports being able to handle it. The InfoExtractor returns
59 all the information to the FileDownloader and the latter downloads the
60 file or does whatever it's instructed to do.
62 File downloaders accept a lot of parameters. In order not to saturate
63 the object constructor with arguments, it receives a dictionary of
64 options instead. These options are available through the get_params()
65 method for the InfoExtractors to use. The FileDownloader also registers
66 itself as the downloader in charge for the InfoExtractors that are
67 added to it, so this is a "mutual registration".
71 username: Username for authentication purposes.
72 password: Password for authentication purposes.
73 usenetrc: Use netrc for authentication instead.
74 quiet: Do not print messages to stdout.
75 forceurl: Force printing final URL.
76 forcetitle: Force printing title.
77 simulate: Do not download the video files.
78 format: Video format code.
79 outtmpl: Template for output names.
80 ignoreerrors: Do not stop on download errors.
86 def __init__(self, params):
87 """Create a FileDownloader object with the given options."""
89 self.set_params(params)
93 """Create directory components in filename. Similar to Unix "mkdir -p"."""
94 components = filename.split(os.sep)
95 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
97 if not os.path.exists(dir):
101 def format_bytes(bytes):
107 exponent = long(math.log(float(bytes), 1024.0))
108 suffix = 'bkMGTPEZY'[exponent]
109 converted = float(bytes) / float(1024**exponent)
110 return '%.2f%s' % (converted, suffix)
113 def calc_percent(byte_counter, data_len):
116 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
119 def calc_eta(start, now, total, current):
123 if current == 0 or dif < 0.001: # One millisecond
125 rate = float(current) / dif
126 eta = long((float(total) - float(current)) / rate)
127 (eta_mins, eta_secs) = divmod(eta, 60)
130 return '%02d:%02d' % (eta_mins, eta_secs)
133 def calc_speed(start, now, bytes):
135 if bytes == 0 or dif < 0.001: # One millisecond
136 return '%10s' % '---b/s'
137 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
140 def best_block_size(elapsed_time, bytes):
141 new_min = max(bytes / 2.0, 1.0)
142 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
143 if elapsed_time < 0.001:
145 rate = bytes / elapsed_time
152 def set_params(self, params):
153 """Sets parameters."""
154 if type(params) != dict:
155 raise ValueError('params: dictionary expected')
156 self._params = params
158 def get_params(self):
159 """Get parameters."""
162 def add_info_extractor(self, ie):
163 """Add an InfoExtractor object to the end of the list."""
165 ie.set_downloader(self)
167 def to_stdout(self, message, skip_eol=False):
168 """Print message to stdout if not in quiet mode."""
169 if not self._params.get('quiet', False):
170 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
173 def to_stderr(self, message):
174 """Print message to stderr."""
175 sys.stderr.write('%s\n' % message)
177 def fixed_template(self):
178 """Checks if the output template is fixed."""
179 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
181 def trouble(self, message=None):
182 """Determine action to take when a download problem appears.
184 Depending on if the downloader has been configured to ignore
185 download errors or not, this method may throw an exception or
186 not when errors are found, after printing the message. If it
187 doesn't raise, it returns an error code suitable to be returned
188 later as a program exit code to indicate error.
190 if message is not None:
191 self.to_stderr(message)
192 if not self._params.get('ignoreerrors', False):
193 raise DownloadError(message)
196 def report_destination(self, filename):
197 """Report destination filename."""
198 self.to_stdout('[download] Destination: %s' % filename)
200 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
201 """Report download progress."""
202 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
203 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
205 def report_finish(self):
206 """Report download finished."""
209 def download(self, url_list):
210 """Download a given list of URLs."""
212 if len(url_list) > 1 and self.fixed_template():
213 raise SameFileError(self._params['outtmpl'])
216 suitable_found = False
218 if not ie.suitable(url):
220 # Suitable InfoExtractor found
221 suitable_found = True
222 all_results = ie.extract(url)
223 results = [x for x in all_results if x is not None]
224 if len(results) != len(all_results):
225 retcode = self.trouble()
227 if len(results) > 1 and self.fixed_template():
228 raise SameFileError(self._params['outtmpl'])
230 for result in results:
233 if self._params.get('forcetitle', False):
234 print result['title']
235 if self._params.get('forceurl', False):
238 # Do nothing else if in simulate mode
239 if self._params.get('simulate', False):
243 filename = self._params['outtmpl'] % result
244 self.report_destination(filename)
245 except (ValueError, KeyError), err:
246 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
249 self.pmkdir(filename)
250 except (OSError, IOError), err:
251 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
254 outstream = open(filename, 'wb')
255 except (OSError, IOError), err:
256 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
259 self._do_download(outstream, result['url'])
261 except (OSError, IOError), err:
262 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
264 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
265 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
268 if not suitable_found:
269 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
273 def _do_download(self, stream, url):
274 request = urllib2.Request(url, None, std_headers)
275 data = urllib2.urlopen(request)
276 data_len = data.info().get('Content-length', None)
277 data_len_str = self.format_bytes(data_len)
283 percent_str = self.calc_percent(byte_counter, data_len)
284 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
285 speed_str = self.calc_speed(start, time.time(), byte_counter)
286 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
290 data_block = data.read(block_size)
292 data_block_len = len(data_block)
293 if data_block_len == 0:
295 byte_counter += data_block_len
296 stream.write(data_block)
297 block_size = self.best_block_size(after - before, data_block_len)
300 if data_len is not None and str(byte_counter) != data_len:
301 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
303 class InfoExtractor(object):
304 """Information Extractor class.
306 Information extractors are the classes that, given a URL, extract
307 information from the video (or videos) the URL refers to. This
308 information includes the real video URL, the video title and simplified
309 title, author and others. It is returned in a list of dictionaries when
310 calling its extract() method. It is a list because a URL can refer to
311 more than one video (think of playlists). The dictionaries must include
312 the following fields:
314 id: Video identifier.
315 url: Final video URL.
316 uploader: Nickname of the video uploader.
317 title: Literal title.
318 stitle: Simplified title.
319 ext: Video filename extension.
321 Subclasses of this one should re-define the _real_initialize() and
322 _real_extract() methods, as well as the suitable() static method.
323 Probably, they should also be instantiated and added to the main
330 def __init__(self, downloader=None):
331 """Constructor. Receives an optional downloader."""
333 self.set_downloader(downloader)
337 """Receives a URL and returns True if suitable for this IE."""
340 def initialize(self):
341 """Initializes an instance (authentication, etc)."""
343 self._real_initialize()
346 def extract(self, url):
347 """Extracts URL information and returns it in list of dicts."""
349 return self._real_extract(url)
351 def set_downloader(self, downloader):
352 """Sets the downloader for this IE."""
353 self._downloader = downloader
355 def to_stdout(self, message):
356 """Print message to stdout if downloader is not in quiet mode."""
357 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
360 def to_stderr(self, message):
361 """Print message to stderr."""
362 sys.stderr.write('%s\n' % message)
364 def _real_initialize(self):
365 """Real initialization process. Redefine in subclasses."""
368 def _real_extract(self, url):
369 """Real extraction process. Redefine in subclasses."""
372 class YoutubeIE(InfoExtractor):
373 """Information extractor for youtube.com."""
# Login endpoint; _real_initialize() posts the login form to this URL.
_LOGIN_URL = 'http://www.youtube.com/login?next=/'
# Age-verification endpoint; _real_initialize() posts the age form to this URL.
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
# Machine name looked up in .netrc when the 'usenetrc' option is set.
_NETRC_MACHINE = 'youtube'
def report_login(self):
    """Tell the user that a login attempt is being made."""
    notice = '[youtube] Logging in'
    self.to_stdout(notice)
def report_age_confirmation(self):
    """Tell the user that an age confirmation attempt is being made."""
    notice = '[youtube] Confirming age'
    self.to_stdout(notice)
def report_webpage_download(self, video_id):
    """Tell the user that the webpage for video_id is being downloaded."""
    notice = '[youtube] %s: Downloading video webpage' % video_id
    self.to_stdout(notice)
def report_information_extraction(self, video_id):
    """Tell the user that information for video_id is being extracted."""
    notice = '[youtube] %s: Extracting video information' % video_id
    self.to_stdout(notice)
def report_video_url(self, video_id, video_real_url):
    """Show the video URL that was extracted for video_id."""
    details = (video_id, video_real_url)
    self.to_stdout('[youtube] %s: URL: %s' % details)
399 def _real_initialize(self):
400 if self._downloader is None:
405 downloader_params = self._downloader.get_params()
407 # Attempt to use provided username and password or .netrc data
408 if downloader_params.get('username', None) is not None:
409 username = downloader_params['username']
410 password = downloader_params['password']
411 elif downloader_params.get('usenetrc', False):
413 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
418 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
419 except (IOError, netrc.NetrcParseError), err:
420 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
423 # No authentication to be performed
429 'current_form': 'loginForm',
431 'action_login': 'Log In',
432 'username': username,
433 'password': password,
435 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
438 login_results = urllib2.urlopen(request).read()
439 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
440 self.to_stderr('WARNING: unable to log in: bad username or password')
442 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
443 self.to_stderr('WARNING: unable to log in: %s' % str(err))
449 'action_confirm': 'Confirm',
451 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
453 self.report_age_confirmation()
454 age_results = urllib2.urlopen(request).read()
455 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
456 self.to_stderr('ERROR: unable to confirm age: %s' % str(err))
459 def _real_extract(self, url):
460 # Extract video id from URL
461 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
463 self.to_stderr('ERROR: invalid URL: %s' % url)
465 video_id = mobj.group(2)
467 # Downloader parameters
469 if self._downloader is not None:
470 params = self._downloader.get_params()
471 format_param = params.get('format', None)
474 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
476 # Normalize URL, including format
477 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
478 if format_param is not None:
479 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
480 request = urllib2.Request(normalized_url, None, std_headers)
482 self.report_webpage_download(video_id)
483 video_webpage = urllib2.urlopen(request).read()
484 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
485 self.to_stderr('ERROR: unable to download video webpage: %s' % str(err))
487 self.report_information_extraction(video_id)
490 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
492 self.to_stderr('ERROR: unable to extract "t" parameter')
494 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
495 if format_param is not None:
496 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
497 self.report_video_url(video_id, video_real_url)
500 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
502 self.to_stderr('ERROR: unable to extract uploader nickname')
504 video_uploader = mobj.group(1)
507 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
509 self.to_stderr('ERROR: unable to extract video title')
511 video_title = mobj.group(1).decode('utf-8')
512 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
513 video_title = video_title.replace(os.sep, u'%')
516 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
517 simple_title = simple_title.strip(ur'_')
522 'url': video_real_url,
523 'uploader': video_uploader,
524 'title': video_title,
525 'stitle': simple_title,
526 'ext': video_extension,
529 if __name__ == '__main__':
531 # Modules needed only when running the main program
535 # General configuration
# Install a single opener that carries both the proxy and the cookie
# handler. Every urllib2.install_opener() call replaces the previously
# installed opener, so installing two openers in a row would leave only
# the second one in effect.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
541 parser = optparse.OptionParser(
542 usage='Usage: %prog [options] url...',
544 conflict_handler='resolve',
546 parser.add_option('-h', '--help',
547 action='help', help='print this help text and exit')
548 parser.add_option('-v', '--version',
549 action='version', help='print program version and exit')
550 parser.add_option('-u', '--username',
551 dest='username', metavar='UN', help='account username')
552 parser.add_option('-p', '--password',
553 dest='password', metavar='PW', help='account password')
554 parser.add_option('-o', '--output',
555 dest='outtmpl', metavar='TPL', help='output filename template')
556 parser.add_option('-q', '--quiet',
557 action='store_true', dest='quiet', help='activates quiet mode', default=False)
558 parser.add_option('-s', '--simulate',
559 action='store_true', dest='simulate', help='do not download video', default=False)
560 parser.add_option('-t', '--title',
561 action='store_true', dest='usetitle', help='use title in file name', default=False)
562 parser.add_option('-l', '--literal',
563 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
564 parser.add_option('-n', '--netrc',
565 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
566 parser.add_option('-g', '--get-url',
567 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
568 parser.add_option('-e', '--get-title',
569 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
570 parser.add_option('-f', '--format',
571 dest='format', metavar='FMT', help='video format code')
# Both aliases store into 'format', the destination -f uses and the only
# one read back (as opts.format) when the FileDownloader options are built.
# Storing into any other destination would make -b and -m have no effect.
parser.add_option('-b', '--best-quality',
        action='store_const', dest='format', help='alias for -f 18', const='18')
parser.add_option('-m', '--mobile-version',
        action='store_const', dest='format', help='alias for -f 17', const='17')
576 parser.add_option('-i', '--ignore-errors',
577 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
578 (opts, args) = parser.parse_args()
580 # Conflicting, missing and erroneous options
582 sys.exit('ERROR: you must provide at least one URL')
583 if opts.usenetrc and (opts.username is not None or opts.password is not None):
584 sys.exit('ERROR: using .netrc conflicts with giving username/password')
585 if opts.password is not None and opts.username is None:
586 sys.exit('ERROR: account username missing')
587 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
588 sys.exit('ERROR: using output template conflicts with using title or literal title')
589 if opts.usetitle and opts.useliteral:
590 sys.exit('ERROR: using title conflicts with using literal title')
591 if opts.username is not None and opts.password is None:
592 opts.password = getpass.getpass('Type account password and press return:')
594 # Information extractors
595 youtube_ie = YoutubeIE()
598 fd = FileDownloader({
599 'usenetrc': opts.usenetrc,
600 'username': opts.username,
601 'password': opts.password,
602 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
603 'forceurl': opts.geturl,
604 'forcetitle': opts.gettitle,
605 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
606 'format': opts.format,
607 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl)
608 or (opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
609 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
610 or '%(id)s.%(ext)s'),
611 'ignoreerrors': opts.ignoreerrors,
613 fd.add_info_extractor(youtube_ie)
614 retcode = fd.download(args)
617 except DownloadError:
619 except SameFileError:
620 sys.exit('ERROR: fixed output name but more than one file to download')
621 except KeyboardInterrupt:
622 sys.exit('\nERROR: Interrupted by user')