2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
# HTTP headers sent with every request so that the script looks like a
# regular browser.  The User-Agent value must be the bare product string:
# the header name is added by urllib2 itself, so a leading "UserAgent: "
# inside the value would be transmitted literally.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}
# Characters allowed in a simplified title: ASCII letters followed by the
# decimal digits.  Starting the concatenation with a unicode literal makes
# the result a unicode string, so it can be interpolated into unicode
# regular expressions.
simple_title_chars = u'' + string.ascii_letters + string.digits
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports it's able to handle it. The InfoExtractor returns
    all the information to the FileDownloader and the latter downloads the
    file or does whatever it's instructed to do.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the get_params()
    method for the InfoExtractors to use. The FileDownloader also registers
    itself as the downloader in charge for the InfoExtractors that are
    added to it, so this is a "mutual registration".

    Available options:

    username:   Username for authentication purposes.
    password:   Password for authentication purposes.
    usenetrc:   Use netrc for authentication instead.
    quiet:      Do not print messages to stdout.
    format:     Video format code.
    outtmpl:    Template for output names.
    """

    # Suffixes used by format_bytes(), one per power of 1024.
    _BYTE_SUFFIXES = 'bkMGTPEZY'

    _params = None
    _ies = []

    def __init__(self, params):
        """Create a downloader configured by the `params` dictionary."""
        # Per-instance list, so extractors are never shared through the
        # class-level default.
        self._ies = []
        self.set_params(params)

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        Only the directory part of `filename` is created; the last path
        component is taken to be the file itself.  Raises OSError if a
        directory cannot be created.
        """
        directory = os.path.dirname(filename)
        # An empty directory part means "current directory": nothing to do.
        # This also keeps absolute paths from triggering os.mkdir('').
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Return a short human-readable size such as '512b' or '1.50k'.

        `bytes` may be a number, a numeric string (as found in HTTP
        headers) or None, in which case 'N/A' is returned.
        """
        if bytes is None:
            return 'N/A'
        suffixes = FileDownloader._BYTE_SUFFIXES
        amount = float(bytes)
        if amount < 1.0:
            # The logarithm is undefined for 0 and negative below 1.
            exponent = 0
        else:
            # Clamp so that huge values reuse the largest known suffix.
            exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
        suffix = suffixes[exponent]
        if exponent == 0:
            return '%s%s' % (bytes, suffix)
        converted = amount / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        Returns '---.-%' when the total length is unknown (None) or zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0:
            # A zero-length body would otherwise divide by zero.
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        Returns '--:--' when the total is unknown, when no rate can be
        computed yet, or when the estimate exceeds 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        # Never show a negative time if more than `total` was received.
        eta = max(int((float(total) - float(current)) / rate), 0)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            # Same width as the regular case so the progress line is stable.
            return '%10s' % 'N/A b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Return the size to request for the next read.

        The size follows the rate observed for the last block, limited to
        between half and double the last block size and to 4 MB at most.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    def set_params(self, params):
        """Sets parameters. Raises ValueError unless given a dictionary."""
        if not isinstance(params, dict):
            raise ValueError('params: dictionary expected')
        self._params = params

    def get_params(self):
        """Get parameters."""
        return self._params

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def download(self, url_list):
        """Download a given list of URLs.

        Each URL is handed to the first registered InfoExtractor that
        reports it as suitable.  Problems are reported on stderr and the
        remaining results and URLs are still processed.
        """
        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                if not ie.suitable(url):
                    continue
                # Suitable InfoExtractor found
                suitable_found = True
                for result in ie.extract(url):
                    if result is None:
                        continue
                    self._download_result(result)
                # Only the first suitable extractor handles the URL.
                break
            if not suitable_found:
                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

    def _download_result(self, result):
        """Save the video described by one extractor result dictionary."""
        try:
            filename = self._params['outtmpl'] % result
        except KeyError as err:
            sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
            return
        try:
            self.pmkdir(filename)
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
            return
        try:
            outstream = open(filename, 'wb')
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
            return
        try:
            try:
                self._do_download(outstream, result['url'])
            finally:
                # Close the file whether or not the transfer succeeded.
                outstream.close()
        # Network errors are listed first: urllib2.URLError (and
        # socket.error on Python 2.6+) derive from IOError and would
        # otherwise be reported as write errors.
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
        except (OSError, IOError) as err:
            sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))

    def _do_download(self, stream, url):
        """Copy the resource at `url` into `stream`, reporting progress.

        Progress goes to stdout unless the 'quiet' option is set.  Raises
        ValueError when fewer bytes than the announced Content-length
        were received.
        """
        quiet = self._params.get('quiet', False)
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            # Header value: a string, or None when the server sent none.
            data_len = data.info().get('Content-length', None)
            data_len_str = self.format_bytes(data_len)
            byte_counter = 0
            block_size = 1024
            start = time.time()
            while True:
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)

                if not quiet:
                    sys.stdout.write('\r[download] %s of %s at %s ETA %s' %
                                     (percent_str, data_len_str, speed_str, eta_str))
                    sys.stdout.flush()

                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len
                stream.write(data_block)
                block_size = self.best_block_size(after - before, data_block_len)
        finally:
            # Release the connection even if reading or writing failed.
            data.close()

        if not quiet:
            # Terminate the progress line, which is rewritten with '\r'.
            print('')

        if data_len is not None and str(byte_counter) != data_len:
            raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. It is returned in a list of dictionaries when
    calling its extract() method. It is a list because a URL can refer to
    more than one video (think of playlists). The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return True

    def initialize(self):
        """Initializes an instance (login, etc)."""
        # The real initialization runs only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def to_stdout(self, message):
        """Print message to stdout unless the downloader is in quiet mode."""
        if self._downloader is None or not self._downloader.get_params().get('quiet', False):
            print(message)

    def to_stderr(self, message):
        """Write message, followed by a newline, to stderr."""
        sys.stderr.write('%s\n' % message)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _LOGIN_URL = 'http://www.youtube.com/login?next=/'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
    _NETRC_MACHINE = 'youtube'

    @staticmethod
    def _unescape_entity(match):
        """Return the character for one matched HTML entity.

        Handles named ('&amp;'), decimal ('&#39;') and hexadecimal
        ('&#x27;') entities.  Unknown or malformed entities are returned
        unchanged instead of raising.
        """
        entity = match.group(1)
        try:
            if entity[:2] in (u'#x', u'#X'):
                return unichr(int(entity[2:], 16))
            if entity[:1] == u'#':
                return unichr(int(entity[1:]))
            return unichr(htmlentitydefs.name2codepoint[entity])
        except (KeyError, ValueError):
            return match.group(0)

    def _real_initialize(self):
        """Log in and confirm age, when credentials are available.

        Credentials come from the downloader options 'username' and
        'password' or, if 'usenetrc' is set, from the .netrc entry for the
        'youtube' machine.  Login problems only produce a warning; a
        failure to confirm age terminates the program.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.get_params()

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.to_stdout('[youtube] Logging in')
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were refused.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self.to_stderr('WARNING: Unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self.to_stderr('WARNING: Unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.to_stdout('[youtube] Confirming age')
            # The response body is read but its content is not needed.
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The element is a dictionary with the keys 'id', 'url', 'uploader',
        'title', 'stitle' and 'ext', or None when some piece of
        information could not be extracted.  Terminates the program if the
        video web page cannot be downloaded.
        """
        # Extract video id from URL
        mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
        if mobj is None:
            self.to_stderr('ERROR: Invalid URL: %s' % url)
            return [None]
        video_id = mobj.group(2)

        # Downloader parameters
        format_param = None
        if self._downloader is not None:
            params = self._downloader.get_params()
            format_param = params.get('format', None)

        # Extension
        video_extension = {18: 'mp4'}.get(format_param, 'flv')

        # Normalize URL, including format
        normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
        if format_param is not None:
            normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
        request = urllib2.Request(normalized_url, None, std_headers)
        try:
            self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            sys.exit('ERROR: Unable to download video: %s' % str(err))
        self.to_stdout('[youtube] %s: Extracting video information' % video_id)

        # "t" param
        mobj = re.search(r', "t": "([^"]+)"', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract "t" parameter')
            return [None]
        video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
        if format_param is not None:
            video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

        # Uploader
        mobj = re.search(r'More From: ([^<]*)<', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract uploader nickname')
            return [None]
        video_uploader = mobj.group(1)

        # Title
        mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
        if mobj is None:
            self.to_stderr('ERROR: Unable to extract video title')
            return [None]
        video_title = mobj.group(1).decode('utf-8')
        # Match only well-formed entity references, so a bare '&' in the
        # title cannot swallow the text up to the next ';'.
        video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._unescape_entity, video_title)

        # Simplified title: runs of other characters become one underscore
        simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
        simple_title = simple_title.strip(u'_')

        # Return information
        return [{
            'id': video_id,
            'url': video_real_url,
            'uploader': video_uploader,
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension,
        }]
if __name__ == '__main__':
    try:
        # General configuration.  install_opener() replaces the global
        # opener, so proxy and cookie support go into a single opener
        # rather than two successive installs.
        urllib2.install_opener(urllib2.build_opener(
            urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

        # Information extractors
        youtube_ie = YoutubeIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': False,
            'username': None,
            'password': None,
            'quiet': False,
            'format': None,
            'outtmpl': '%(id)s.%(ext)s',
        })
        fd.add_info_extractor(youtube_ie)
        fd.download([
            'http://www.youtube.com/watch?v=t7qdwI7TVe8',
            'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
            'http://www.youtube.com/watch?v=DZRXe1wtC-M',
        ])

    except KeyboardInterrupt:
        sys.exit('\nERROR: Interrupted by user')