2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
20 'User-Agent': 'UserAgent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class FileDownloader(object):
29 """File Downloader class.
31 File downloader objects are the ones responsible for downloading the
32 actual video file and writing it to disk if the user has requested
33 it, among some other tasks. In most cases there should be one per
34 program. Since, given a video URL, the downloader doesn't know how to
35 extract all the needed information (a task that InfoExtractors do), it
36 has to pass the URL to one of them.
38 For this, file downloader objects have a method that allows
39 InfoExtractors to be registered in a given order. When it is passed
40 a URL, the file downloader hands it to the first InfoExtractor it
41 finds that reports being able to handle it. The InfoExtractor returns
42 all the information to the FileDownloader and the latter downloads the
43 file or does whatever it's instructed to do.
45 File downloaders accept a lot of parameters. In order not to saturate
46 the object constructor with arguments, it receives a dictionary of
47 options instead. These options are available through the get_params()
48 method for the InfoExtractors to use. The FileDownloader also registers
49 itself as the downloader in charge for the InfoExtractors that are
50 added to it, so this is a "mutual registration".
54 username: Username for authentication purposes.
55 password: Password for authentication purposes.
56 usenetrc: Use netrc for authentication instead.
57 quiet: Do not print messages to stdout.
58 simulate: Do not download the video files.
59 format: Video format code.
60 outtmpl: Template for output names.
66 def __init__(self, params):
68 self.set_params(params)
72 """Create directory components in filename. Similar to Unix "mkdir -p"."""
73 components = filename.split(os.sep)
74 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
76 if not os.path.exists(dir):
80 def format_bytes(bytes):
86 exponent = long(math.log(float(bytes), 1024.0))
87 suffix = 'bkMGTPEZY'[exponent]
88 converted = float(bytes) / float(1024**exponent)
89 return '%.2f%s' % (converted, suffix)
92 def calc_percent(byte_counter, data_len):
95 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
98 def calc_eta(start, now, total, current):
102 if current == 0 or dif < 0.001: # One millisecond
104 rate = float(current) / dif
105 eta = long((float(total) - float(current)) / rate)
106 (eta_mins, eta_secs) = divmod(eta, 60)
109 return '%02d:%02d' % (eta_mins, eta_secs)
112 def calc_speed(start, now, bytes):
114 if bytes == 0 or dif < 0.001: # One millisecond
115 return '%10s' % '---b/s'
116 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
119 def best_block_size(elapsed_time, bytes):
120 new_min = max(bytes / 2.0, 1.0)
121 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
122 if elapsed_time < 0.001:
124 rate = bytes / elapsed_time
def set_params(self, params):
    """Store the options dictionary used by this downloader.

    params: dictionary of options. It is stored by reference in
    self._params (no copy is made).

    Raises ValueError if params is not a dictionary.
    """
    # isinstance() instead of an exact type() comparison, so that dict
    # subclasses (e.g. collections.OrderedDict) are accepted as well.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
137 def get_params(self):
138 """Get parameters."""
141 def add_info_extractor(self, ie):
142 """Add an InfoExtractor object to the end of the list."""
144 ie.set_downloader(self)
146 def to_stdout(self, message, skip_eol=False):
147 """Print message to stdout if not in quiet mode."""
148 if not self._params.get('quiet', False):
149 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
152 def download(self, url_list):
153 """Download a given list of URLs."""
155 suitable_found = False
157 if not ie.suitable(url):
159 # Suitable InfoExtractor found
160 suitable_found = True
161 results = [x for x in ie.extract(url) if x is not None]
163 if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
164 sys.exit('ERROR: fixed output name but more than one file to download')
166 if self._params.get('simulate', False):
169 for result in results:
171 filename = self._params['outtmpl'] % result
172 except (KeyError), err:
173 sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
176 self.pmkdir(filename)
177 except (OSError, IOError), err:
178 sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
181 outstream = open(filename, 'wb')
182 except (OSError, IOError), err:
183 sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
186 self._do_download(outstream, result['url'])
188 except (OSError, IOError), err:
189 sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
191 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
192 sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
195 if not suitable_found:
196 sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)
198 def _do_download(self, stream, url):
199 request = urllib2.Request(url, None, std_headers)
200 data = urllib2.urlopen(request)
201 data_len = data.info().get('Content-length', None)
202 data_len_str = self.format_bytes(data_len)
207 percent_str = self.calc_percent(byte_counter, data_len)
208 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
209 speed_str = self.calc_speed(start, time.time(), byte_counter)
210 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
211 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
214 data_block = data.read(block_size)
216 data_block_len = len(data_block)
217 if data_block_len == 0:
219 byte_counter += data_block_len
220 stream.write(data_block)
221 block_size = self.best_block_size(after - before, data_block_len)
224 if data_len is not None and str(byte_counter) != data_len:
225 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
227 class InfoExtractor(object):
228 """Information Extractor class.
230 Information extractors are the classes that, given a URL, extract
231 information from the video (or videos) the URL refers to. This
232 information includes the real video URL, the video title and simplified
233 title, author and others. It is returned in a list of dictionaries when
234 calling its extract() method. It is a list because a URL can refer to
235 more than one video (think of playlists). The dictionaries must include
236 the following fields:
238 id: Video identifier.
239 url: Final video URL.
240 uploader: Nickname of the video uploader.
241 title: Literal title.
242 stitle: Simplified title.
243 ext: Video filename extension.
245 Subclasses of this one should re-define the _real_initialize() and
246 _real_extract() methods, as well as the suitable() static method.
247 Probably, they should also be instantiated and added to the main
254 def __init__(self, downloader=None):
255 """Constructor. Receives an optional downloader."""
257 self.set_downloader(downloader)
261 """Receives a URL and returns True if suitable for this IE."""
264 def initialize(self):
265 """Initializes an instance (login, etc)."""
267 self._real_initialize()
270 def extract(self, url):
271 """Extracts URL information and returns it in list of dicts."""
273 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember which downloader this extractor reports to.

    downloader: the downloader object to attach; stored as-is in
    self._downloader, replacing any previous value.
    """
    self._downloader = downloader
279 def to_stdout(self, message):
280 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Write message to standard error, terminated by a newline."""
    line = '%s\n' % message
    sys.stderr.write(line)
286 def _real_initialize(self):
287 """Real initialization process. Redefine in subclasses."""
290 def _real_extract(self, url):
291 """Real extraction process. Redefine in subclasses."""
294 class YoutubeIE(InfoExtractor):
295 """Information extractor for youtube.com."""
297 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
298 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
299 _NETRC_MACHINE = 'youtube'
301 def _real_initialize(self):
302 if self._downloader is None:
307 downloader_params = self._downloader.get_params()
309 # Attempt to use provided username and password or .netrc data
310 if downloader_params.get('username', None) is not None:
311 username = downloader_params['username']
312 password = downloader_params['password']
313 elif downloader_params.get('usenetrc', False):
315 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
320 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
321 except (IOError, netrc.NetrcParseError), err:
322 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
330 'current_form': 'loginForm',
332 'action_login': 'Log In',
333 'username': username,
334 'password': password,
336 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
338 self.to_stdout('[youtube] Logging in')
339 login_results = urllib2.urlopen(request).read()
340 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
341 self.to_stderr('WARNING: Unable to log in: bad username or password')
343 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
344 self.to_stderr('WARNING: Unable to log in: %s' % str(err))
350 'action_confirm': 'Confirm',
352 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
354 self.to_stdout('[youtube] Confirming age')
355 age_results = urllib2.urlopen(request).read()
356 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
357 sys.exit('ERROR: Unable to confirm age: %s' % str(err))
359 def _real_extract(self, url):
360 # Extract video id from URL
361 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
363 self.to_stderr('ERROR: Invalid URL: %s' % url)
365 video_id = mobj.group(2)
367 # Downloader parameters
369 if self._downloader is not None:
370 params = self._downloader.get_params()
371 format_param = params.get('format', None)
374 video_extension = {18: 'mp4'}.get(format_param, 'flv')
376 # Normalize URL, including format
377 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
378 if format_param is not None:
379 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
380 request = urllib2.Request(normalized_url, None, std_headers)
382 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
383 video_webpage = urllib2.urlopen(request).read()
384 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
385 sys.exit('ERROR: Unable to download video: %s' % str(err))
386 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
389 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
391 self.to_stderr('ERROR: Unable to extract "t" parameter')
393 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
394 if format_param is not None:
395 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
396 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
399 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
401 self.to_stderr('ERROR: Unable to extract uploader nickname')
403 video_uploader = mobj.group(1)
406 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
408 self.to_stderr('ERROR: Unable to extract video title')
410 video_title = mobj.group(1).decode('utf-8')
411 video_title = re.sub(u'&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
414 simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
415 simple_title = simple_title.strip(u'_')
420 'url': video_real_url,
421 'uploader': video_uploader,
422 'title': video_title,
423 'stitle': simple_title,
424 'ext': video_extension,
427 if __name__ == '__main__':
429 # General configuration
430 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
431 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
433 # Information extractors
434 youtube_ie = YoutubeIE()
437 fd = FileDownloader({
444 'outtmpl': '%(id)s.%(ext)s'
446 fd.add_info_extractor(youtube_ie)
448 'http://www.youtube.com/watch?v=t7qdwI7TVe8',
449 'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
450 'http://www.youtube.com/watch?v=DZRXe1wtC-M',
453 except KeyboardInterrupt:
454 sys.exit('\nERROR: Interrupted by user')