2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class FileDownloader(object):
29 """File Downloader class.
File downloader objects are the ones responsible for downloading the
actual video file and writing it to disk if the user has requested
it, among some other tasks. In most cases there should be one per
program. Since, given a video URL, the downloader doesn't know how to
extract all the needed information (a task that InfoExtractors do), it
has to pass the URL to one of them.
For this, file downloader objects have a method that allows
InfoExtractors to be registered in a given order. When it is passed
a URL, the file downloader hands it to the first InfoExtractor it
finds that reports being able to handle it. The InfoExtractor returns
all the information to the FileDownloader and the latter downloads the
file or does whatever it's instructed to do.
45 File downloaders accept a lot of parameters. In order not to saturate
46 the object constructor with arguments, it receives a dictionary of
47 options instead. These options are available through the get_params()
48 method for the InfoExtractors to use. The FileDownloader also registers
49 itself as the downloader in charge for the InfoExtractors that are
50 added to it, so this is a "mutual registration".
54 username: Username for authentication purposes.
55 password: Password for authentication purposes.
56 usenetrc: Use netrc for authentication instead.
57 quiet: Do not print messages to stdout.
58 forceurl: Force printing final URL.
59 forcetitle: Force printing title.
60 simulate: Do not download the video files.
61 format: Video format code.
62 outtmpl: Template for output names.
68 def __init__(self, params):
70 self.set_params(params)
74 """Create directory components in filename. Similar to Unix "mkdir -p"."""
75 components = filename.split(os.sep)
76 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
78 if not os.path.exists(dir):
82 def format_bytes(bytes):
88 exponent = long(math.log(float(bytes), 1024.0))
89 suffix = 'bkMGTPEZY'[exponent]
90 converted = float(bytes) / float(1024**exponent)
91 return '%.2f%s' % (converted, suffix)
94 def calc_percent(byte_counter, data_len):
97 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
100 def calc_eta(start, now, total, current):
104 if current == 0 or dif < 0.001: # One millisecond
106 rate = float(current) / dif
107 eta = long((float(total) - float(current)) / rate)
108 (eta_mins, eta_secs) = divmod(eta, 60)
111 return '%02d:%02d' % (eta_mins, eta_secs)
114 def calc_speed(start, now, bytes):
116 if bytes == 0 or dif < 0.001: # One millisecond
117 return '%10s' % '---b/s'
118 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
121 def best_block_size(elapsed_time, bytes):
122 new_min = max(bytes / 2.0, 1.0)
123 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
124 if elapsed_time < 0.001:
126 rate = bytes / elapsed_time
def set_params(self, params):
    """Store the dictionary of downloader options.

    params -- dictionary of options. It is kept by reference, not copied.

    Raises ValueError if params is not a dictionary.
    """
    # isinstance() instead of an exact type comparison, so that dict
    # subclasses (OrderedDict, defaultdict, ...) are accepted as well.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
139 def get_params(self):
140 """Get parameters."""
143 def add_info_extractor(self, ie):
144 """Add an InfoExtractor object to the end of the list."""
146 ie.set_downloader(self)
148 def to_stdout(self, message, skip_eol=False):
149 """Print message to stdout if not in quiet mode."""
150 if not self._params.get('quiet', False):
151 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
def to_stderr(self, message):
    """Print message to stderr, followed by a newline.

    The 'quiet' option is not consulted: the message is always written.
    """
    # The 1-tuple keeps a tuple-valued message from being unpacked as
    # several format arguments (which would raise TypeError).
    sys.stderr.write('%s\n' % (message,))
def fixed_template(self):
    """Tell whether the output template contains no %(name)s fields.

    Returns True when the 'outtmpl' option holds no such placeholder,
    False as soon as one is found.
    """
    template = self._params['outtmpl']
    # Matches a %(name)s style placeholder anywhere in the template.
    has_field = re.search(u'(?u)%\\(.+?\\)s', template) is not None
    return not has_field
162 def download(self, url_list):
163 """Download a given list of URLs."""
164 if len(url_list) > 1 and self.fixed_template():
165 sys.exit('ERROR: fixed output name but more than one file to download')
168 suitable_found = False
170 if not ie.suitable(url):
172 # Suitable InfoExtractor found
173 suitable_found = True
174 results = [x for x in ie.extract(url) if x is not None]
176 if len(results) > 1 and self.fixed_template():
177 sys.exit('ERROR: fixed output name but more than one file to download')
179 for result in results:
182 if self._params.get('forcetitle', False):
183 print result['title']
184 if self._params.get('forceurl', False):
187 # Do nothing else if in simulate mode
188 if self._params.get('simulate', False):
192 filename = self._params['outtmpl'] % result
193 except (ValueError, KeyError), err:
194 self.to_stderr('ERROR: invalid output template: %s' % str(err))
197 self.pmkdir(filename)
198 except (OSError, IOError), err:
199 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
202 outstream = open(filename, 'wb')
203 except (OSError, IOError), err:
204 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
207 self._do_download(outstream, result['url'])
209 except (OSError, IOError), err:
210 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
213 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
216 if not suitable_found:
217 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
219 def _do_download(self, stream, url):
220 request = urllib2.Request(url, None, std_headers)
221 data = urllib2.urlopen(request)
222 data_len = data.info().get('Content-length', None)
223 data_len_str = self.format_bytes(data_len)
228 percent_str = self.calc_percent(byte_counter, data_len)
229 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
230 speed_str = self.calc_speed(start, time.time(), byte_counter)
231 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
232 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
235 data_block = data.read(block_size)
237 data_block_len = len(data_block)
238 if data_block_len == 0:
240 byte_counter += data_block_len
241 stream.write(data_block)
242 block_size = self.best_block_size(after - before, data_block_len)
245 if data_len is not None and str(byte_counter) != data_len:
246 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
248 class InfoExtractor(object):
249 """Information Extractor class.
251 Information extractors are the classes that, given a URL, extract
252 information from the video (or videos) the URL refers to. This
253 information includes the real video URL, the video title and simplified
254 title, author and others. It is returned in a list of dictionaries when
255 calling its extract() method. It is a list because a URL can refer to
256 more than one video (think of playlists). The dictionaries must include
257 the following fields:
259 id: Video identifier.
260 url: Final video URL.
261 uploader: Nickname of the video uploader.
262 title: Literal title.
263 stitle: Simplified title.
264 ext: Video filename extension.
266 Subclasses of this one should re-define the _real_initialize() and
267 _real_extract() methods, as well as the suitable() static method.
268 Probably, they should also be instantiated and added to the main
275 def __init__(self, downloader=None):
276 """Constructor. Receives an optional downloader."""
278 self.set_downloader(downloader)
282 """Receives a URL and returns True if suitable for this IE."""
285 def initialize(self):
286 """Initializes an instance (login, etc)."""
288 self._real_initialize()
291 def extract(self, url):
292 """Extracts URL information and returns it in list of dicts."""
294 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the downloader this IE reports to.

    downloader -- object to remember; None is stored as is.
    """
    self._downloader = downloader
300 def to_stdout(self, message):
301 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Print message to stderr, followed by a newline.

    The message is written unconditionally; no downloader option is
    checked here.
    """
    # The 1-tuple keeps a tuple-valued message from being unpacked as
    # several format arguments (which would raise TypeError).
    sys.stderr.write('%s\n' % (message,))
307 def _real_initialize(self):
308 """Real initialization process. Redefine in subclasses."""
311 def _real_extract(self, url):
312 """Real extraction process. Redefine in subclasses."""
315 class YoutubeIE(InfoExtractor):
316 """Information extractor for youtube.com."""
318 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
319 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
320 _NETRC_MACHINE = 'youtube'
322 def _real_initialize(self):
323 if self._downloader is None:
328 downloader_params = self._downloader.get_params()
330 # Attempt to use provided username and password or .netrc data
331 if downloader_params.get('username', None) is not None:
332 username = downloader_params['username']
333 password = downloader_params['password']
334 elif downloader_params.get('usenetrc', False):
336 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
341 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
342 except (IOError, netrc.NetrcParseError), err:
343 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
351 'current_form': 'loginForm',
353 'action_login': 'Log In',
354 'username': username,
355 'password': password,
357 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
359 self.to_stdout('[youtube] Logging in')
360 login_results = urllib2.urlopen(request).read()
361 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
362 self.to_stderr('WARNING: Unable to log in: bad username or password')
364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
365 self.to_stderr('WARNING: Unable to log in: %s' % str(err))
371 'action_confirm': 'Confirm',
373 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
375 self.to_stdout('[youtube] Confirming age')
376 age_results = urllib2.urlopen(request).read()
377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
378 sys.exit('ERROR: Unable to confirm age: %s' % str(err))
380 def _real_extract(self, url):
381 # Extract video id from URL
382 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
384 self.to_stderr('ERROR: Invalid URL: %s' % url)
386 video_id = mobj.group(2)
388 # Downloader parameters
390 if self._downloader is not None:
391 params = self._downloader.get_params()
392 format_param = params.get('format', None)
395 video_extension = {'18': 'mp4'}.get(format_param, 'flv')
397 # Normalize URL, including format
398 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
399 if format_param is not None:
400 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
401 request = urllib2.Request(normalized_url, None, std_headers)
403 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
404 video_webpage = urllib2.urlopen(request).read()
405 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
406 sys.exit('ERROR: Unable to download video: %s' % str(err))
407 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
410 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
412 self.to_stderr('ERROR: Unable to extract "t" parameter')
414 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
415 if format_param is not None:
416 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
417 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
420 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
422 self.to_stderr('ERROR: Unable to extract uploader nickname')
424 video_uploader = mobj.group(1)
427 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
429 self.to_stderr('ERROR: Unable to extract video title')
431 video_title = mobj.group(1).decode('utf-8')
432 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
435 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
436 simple_title = simple_title.strip(ur'_')
441 'url': video_real_url,
442 'uploader': video_uploader,
443 'title': video_title,
444 'stitle': simple_title,
445 'ext': video_extension,
448 if __name__ == '__main__':
450 # Modules needed only when running the main program
# General configuration
# install_opener() sets the single process-wide default opener, so a second
# call replaces whatever the first one installed. Register the proxy and the
# cookie handlers on one opener instead of installing two openers in a row.
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
460 parser = optparse.OptionParser(
461 usage='Usage: %prog [options] url...',
463 conflict_handler='resolve',
# Command line options. '-h' and '-v' are declared explicitly; the parser is
# created with conflict_handler='resolve', so redefining them is permitted.
parser.add_option('-h', '--help',
        action='help', help='print this help text and exit')
parser.add_option('-v', '--version',
        action='version', help='print program version and exit')
parser.add_option('-u', '--username',
        dest='username', metavar='UN', help='account username')
parser.add_option('-p', '--password',
        dest='password', metavar='PW', help='account password')
parser.add_option('-o', '--output',
        dest='outtmpl', metavar='TPL', help='output filename template')
parser.add_option('-q', '--quiet',
        action='store_true', dest='quiet', help='activates quiet mode', default=False)
parser.add_option('-s', '--simulate',
        action='store_true', dest='simulate', help='do not download video', default=False)
parser.add_option('-t', '--title',
        action='store_true', dest='usetitle', help='use title in file name', default=False)
parser.add_option('-l', '--literal',
        action='store_true', dest='useliteral', help='use literal title in file name', default=False)
parser.add_option('-n', '--netrc',
        action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
parser.add_option('-g', '--get-url',
        action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
parser.add_option('-e', '--get-title',
        action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
parser.add_option('-f', '--format',
        dest='format', metavar='FMT', help='video format code')
# '-b' has to store into the same destination as '-f': only opts.format is
# read when the downloader parameters are built, so a separate
# 'video_format' destination would turn this option into a no-op.
parser.add_option('-b', '--best-quality',
        action='store_const', dest='format', help='alias for -f 18', const='18')
(opts, args) = parser.parse_args()
495 # Conflicting, missing and erroneous options
497 sys.exit('ERROR: you must provide at least one URL')
498 if opts.usenetrc and (opts.username is not None or opts.password is not None):
499 sys.exit('ERROR: using .netrc conflicts with giving username/password')
500 if opts.password is not None and opts.username is None:
501 sys.exit('ERROR: account username missing')
502 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
503 sys.exit('ERROR: using output template conflicts with using title or literal title')
504 if opts.usetitle and opts.useliteral:
505 sys.exit('ERROR: using title conflicts with using literal title')
506 if opts.username is not None and opts.password is None:
507 opts.password = getpass.getpass('Type account password and press return:')
509 # Information extractors
510 youtube_ie = YoutubeIE()
513 fd = FileDownloader({
514 'usenetrc': opts.usenetrc,
515 'username': opts.username,
516 'password': opts.password,
517 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
518 'forceurl': opts.geturl,
519 'forcetitle': opts.gettitle,
520 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
521 'format': opts.format,
522 'outtmpl': ((opts.usetitle and '%(stitle)s-%(id)s.%(ext)s')
523 or (opts.useliteral and '%(title)s-%(id)s.%(ext)s')
524 or '%(id)s.%(ext)s'),
526 fd.add_info_extractor(youtube_ie)
529 except KeyboardInterrupt:
530 sys.exit('\nERROR: Interrupted by user')