2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
# HTTP request header entries that present the client as Firefox 3.0.1 on
# Windows.
# NOTE(review): the line that opens this dict is not visible in this listing;
# presumably it is the `std_headers` mapping passed to the urllib2.Request
# calls elsewhere in this file -- confirm.
20 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23 'Accept-Language': 'en-us,en;q=0.5',
# Unicode string made of the ASCII letters followed by the digits.
# YoutubeIE._real_extract interpolates it into a regex character class to
# decide which characters survive in the simplified title.
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28 class FileDownloader(object):
29 """File Downloader class.
31 File downloader objects are the ones responsible for downloading the
32 actual video file and writing it to disk if the user has requested
33 it, among some other tasks. In most cases there should be one per
34 program. As, given a video URL, the downloader doesn't know how to
35 extract all the needed information, task that InfoExtractors do, it
36 has to pass the URL to one of them.
38 For this, file downloader objects have a method that allows
39 InfoExtractors to be registered in a given order. When it is passed
40 a URL, the file downloader hands it to the first InfoExtractor it
41 finds that reports being able to handle it. The InfoExtractor returns
42 all the information to the FileDownloader and the latter downloads the
43 file or does whatever it's instructed to do.
45 File downloaders accept a lot of parameters. In order not to saturate
46 the object constructor with arguments, it receives a dictionary of
47 options instead. These options are available through the get_params()
48 method for the InfoExtractors to use. The FileDownloader also registers
49 itself as the downloader in charge for the InfoExtractors that are
50 added to it, so this is a "mutual registration".
54 username: Username for authentication purposes.
55 password: Password for authentication purposes.
56 usenetrc: Use netrc for authentication instead.
57 quiet: Do not print messages to stdout.
58 simulate: Do not download the video files.
59 format: Video format code.
60 outtmpl: Template for output names.
# Constructor: validates and stores the options dictionary by delegating to
# set_params().
# NOTE(review): original line 67 is absent from this listing, so any other
# initialisation performed there is not visible.
66 def __init__(self, params):
68 self.set_params(params)
# NOTE(review): the `def` line owning this docstring is not visible in this
# listing; download() calls the function as self.pmkdir(filename).
72 """Create directory components in filename. Similar to Unix "mkdir -p"."""
# Split the path on the OS separator and build every leading prefix of it.
# The range stops before len(components), so the last component (the file
# name itself) is never part of a prefix.
73 components = filename.split(os.sep)
74 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
# Only prefixes that do not exist yet are acted on.
# NOTE(review): the loop header binding `dir` and the statement executed in
# this branch are on lines not shown here -- presumably a directory creation
# call; confirm. Note that `dir` shadows the builtin of the same name.
76 if not os.path.exists(dir):
# Format a byte count as a short human-readable string with two decimals and
# a one-letter unit, e.g. 2048 -> '2.00k'.
# NOTE(review): original lines 81-85 are absent from this listing, so any
# special-casing done before the logarithm (for instance of None or 0, which
# math.log cannot handle) is not visible -- confirm.
80 def format_bytes(bytes):
# Integer power of 1024 that fits the value; it doubles as the index into the
# suffix string on the next line.
86 exponent = long(math.log(float(bytes), 1024.0))
87 suffix = 'bkMGTPEZY'[exponent]
88 converted = float(bytes) / float(1024**exponent)
89 return '%.2f%s' % (converted, suffix)
# Express byte_counter as a percentage of data_len with one decimal place,
# right-aligned to at least six characters (e.g. ' 50.0%').
# NOTE(review): original lines 93-94 are not in this listing; handling of an
# unknown (None) data_len presumably lives there -- confirm.
92 def calc_percent(byte_counter, data_len):
95 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# Estimate the remaining download time and render it as 'MM:SS'.
# _do_download() passes time.time() values for the timestamps and byte counts
# for total/current.
98 def calc_eta(start, now, total, current):
# Nothing received yet, or too little time elapsed, to compute a rate.
# NOTE(review): `dif` is assigned on a line not shown in this listing
# (presumably now - start), and the value returned by this branch is not
# visible either.
102 if current == 0 or dif < 0.001: # One millisecond
# Average rate so far, applied to the bytes still outstanding.
104 rate = float(current) / dif
105 eta = long((float(total) - float(current)) / rate)
106 (eta_mins, eta_secs) = divmod(eta, 60)
# NOTE(review): original lines 107-108 are missing from this listing.
109 return '%02d:%02d' % (eta_mins, eta_secs)
# Format the average transfer rate (bytes divided by elapsed time) as
# '<size>/s', right-aligned to at least ten characters.
# NOTE(review): `dif` is assigned on original line 113, which is not part of
# this listing -- presumably now - start.
112 def calc_speed(start, now, bytes):
# No data or no measurable elapsed time: show a placeholder instead of
# dividing by (almost) zero.
114 if bytes == 0 or dif < 0.001: # One millisecond
115 return '%10s' % '---b/s'
116 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Pick the size of the next read from how long the previous read of `bytes`
# bytes took; _do_download() feeds the result back into data.read().
119 def best_block_size(elapsed_time, bytes):
# Bounds for the new size: no less than half and no more than double the last
# block, never below one byte.
120 new_min = max(bytes / 2.0, 1.0)
121 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
# Elapsed time too small to derive a meaningful rate.
# NOTE(review): the body of this branch (original line 123) is not shown.
122 if elapsed_time < 0.001:
124 rate = bytes / elapsed_time
# NOTE(review): original lines 125-129 are missing from this listing, so how
# `rate` is combined with new_min/new_max into the return value is not
# visible.
def set_params(self, params):
    """Store the options dictionary used by this downloader.

    params -- dictionary of options; instances of dict subclasses are
              accepted as well.

    Raises ValueError when params is not a dictionary.
    """
    # isinstance() instead of an exact type comparison, so that dict
    # subclasses are not rejected.
    if not isinstance(params, dict):
        raise ValueError('params: dictionary expected')
    self._params = params
137 def get_params(self):
138 """Get parameters."""
# NOTE(review): the return statement (original line 139) is missing from this
# listing. Callers in this file call .get() on the result, so it presumably
# returns the dictionary stored by set_params() -- confirm.
141 def add_info_extractor(self, ie):
142 """Add an InfoExtractor object to the end of the list."""
# NOTE(review): the statement that stores `ie` (original line 143) is not
# visible in this listing.
# Tell the extractor which downloader it belongs to, so the link goes both
# ways.
144 ie.set_downloader(self)
146 def to_stdout(self, message, skip_eol=False):
147 """Print message to stdout if not in quiet mode."""
148 if not self._params.get('quiet', False):
# Indexing the two-element list with the boolean selects '' when skip_eol is
# true and '\n' otherwise.
149 sys.stdout.write('%s%s' % (message, ['\n', ''][skip_eol]))
# NOTE(review): original line 150 is missing from this listing.
def to_stderr(self, message):
    """Write message to stderr, followed by a newline."""
    line = '%s\n' % message
    sys.stderr.write(line)
156 def fixed_template(self):
157 """Checks if the output template is fixed."""
158 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
160 def download(self, url_list):
161 """Download a given list of URLs."""
# A template without any %(name)s field always yields the same file name, so
# it cannot be used for more than one download.
162 if len(url_list) > 1 and self.fixed_template():
163 sys.exit('ERROR: fixed output name but more than one file to download')
# NOTE(review): the loop headers that bind `url` and `ie` (original lines
# around 165 and 167) are not visible in this listing.
166 suitable_found = False
168 if not ie.suitable(url):
170 # Suitable InfoExtractor found
171 suitable_found = True
# Keep only the entries of the extractor's result list that are not None.
172 results = [x for x in ie.extract(url) if x is not None]
# One URL may expand to several results, so the fixed-template check is
# repeated on the result count.
174 if len(results) > 1 and self.fixed_template():
175 sys.exit('ERROR: fixed output name but more than one file to download')
# NOTE(review): the body of the 'simulate' branch is not shown.
177 if self._params.get('simulate', False):
180 for result in results:
# Build the output file name by filling the template with the result
# dictionary.
# NOTE(review): the `try:` lines and whatever follows each error report below
# are on lines missing from this listing.
182 filename = self._params['outtmpl'] % result
183 except (ValueError, KeyError), err:
184 self.to_stderr('ERROR: invalid output template: %s' % str(err))
187 self.pmkdir(filename)
188 except (OSError, IOError), err:
189 self.to_stderr('ERROR: unable to create directories: %s' % str(err))
# Opened in binary mode for the raw video data.
192 outstream = open(filename, 'wb')
193 except (OSError, IOError), err:
194 self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
197 self._do_download(outstream, result['url'])
199 except (OSError, IOError), err:
200 self.to_stderr('ERROR: unable to write video data: %s' % str(err))
202 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
203 self.to_stderr('ERROR: unable to download video data: %s' % str(err))
# No registered extractor accepted this URL.
206 if not suitable_found:
207 self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)
# Fetch `url` and copy the response body to `stream` block by block while
# printing a progress line.
209 def _do_download(self, stream, url):
210 request = urllib2.Request(url, None, std_headers)
211 data = urllib2.urlopen(request)
# Value of the Content-length header as found in the response headers, or
# None when the header is absent.
212 data_len = data.info().get('Content-length', None)
213 data_len_str = self.format_bytes(data_len)
# NOTE(review): the initialisation of byte_counter, block_size and start, and
# the loop header, are on lines absent from this listing.
218 percent_str = self.calc_percent(byte_counter, data_len)
219 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
220 speed_str = self.calc_speed(start, time.time(), byte_counter)
# The leading '\r' together with skip_eol=True makes each update overwrite the
# previous progress line on a terminal.
221 self.to_stdout('\r[download] %s of %s at %s ETA %s' %
222 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
225 data_block = data.read(block_size)
227 data_block_len = len(data_block)
# An empty read marks the end of the response body.
228 if data_block_len == 0:
230 byte_counter += data_block_len
231 stream.write(data_block)
# Adapt the next read size to how long this read took.
# NOTE(review): `before` and `after` are assigned on lines not shown here --
# presumably timestamps taken around the read; confirm.
232 block_size = self.best_block_size(after - before, data_block_len)
# data_len is compared against str(byte_counter), i.e. it is treated as the
# header's string value here.
235 if data_len is not None and str(byte_counter) != data_len:
236 raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
# Base class for site-specific extractors; YoutubeIE further down in this file
# derives from it.
238 class InfoExtractor(object):
239 """Information Extractor class.
241 Information extractors are the classes that, given a URL, extract
242 information from the video (or videos) the URL refers to. This
243 information includes the real video URL, the video title and simplified
244 title, author and others. It is returned in a list of dictionaries when
245 calling its extract() method. It is a list because a URL can refer to
246 more than one video (think of playlists). The dictionaries must include
247 the following fields:
249 id: Video identifier.
250 url: Final video URL.
251 uploader: Nickname of the video uploader.
252 title: Literal title.
253 stitle: Simplified title.
254 ext: Video filename extension.
256 Subclasses of this one should re-define the _real_initialize() and
257 _real_extract() methods, as well as the suitable() static method.
258 Probably, they should also be instantiated and added to the main
265 def __init__(self, downloader=None):
266 """Constructor. Receives an optional downloader."""
# NOTE(review): original line 267 is absent from this listing, so any state
# initialised there is not visible.
268 self.set_downloader(downloader)
# NOTE(review): the `def` line owning this docstring is not part of this
# listing; FileDownloader.download() calls it as ie.suitable(url).
272 """Receives a URL and returns True if suitable for this IE."""
275 def initialize(self):
276 """Initializes an instance (login, etc)."""
# NOTE(review): original lines 277 and 279 are missing from this listing, so
# whether the call below is guarded (for example to run only once) cannot be
# told from here.
278 self._real_initialize()
281 def extract(self, url):
282 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): original line 283 is missing from this listing; presumably it
# makes sure the instance has been initialised first -- confirm.
# The actual work is delegated to the subclass hook.
284 return self._real_extract(url)
def set_downloader(self, downloader):
    """Remember which downloader (possibly None) this IE reports to."""
    self._downloader = downloader
# Output is wanted when no downloader is attached, or when the attached
# downloader is not in quiet mode.
# NOTE(review): the statement that actually prints the message (original line
# 292) is not visible in this listing.
290 def to_stdout(self, message):
291 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
def to_stderr(self, message):
    """Emit message on standard error, terminated by a newline."""
    err_stream = sys.stderr
    err_stream.write('%s\n' % message)
# Hook called by initialize().
# NOTE(review): the body of this base implementation (original line 299) is
# not visible in this listing.
297 def _real_initialize(self):
298 """Real initialization process. Redefine in subclasses."""
# Hook called by extract(url), which returns this method's result unchanged.
# NOTE(review): the body of this base implementation (original line 303) is
# not visible in this listing.
301 def _real_extract(self, url):
302 """Real extraction process. Redefine in subclasses."""
305 class YoutubeIE(InfoExtractor):
306 """Information extractor for youtube.com."""
# Form endpoints that _real_initialize() posts to: the login form and the age
# confirmation form.
308 _LOGIN_URL = 'http://www.youtube.com/login?next=/'
309 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
# Machine name looked up in the user's .netrc when the 'usenetrc' option is
# set.
310 _NETRC_MACHINE = 'youtube'
# Log in (when credentials are available) and confirm age before extraction.
312 def _real_initialize(self):
# NOTE(review): what happens when no downloader is attached (original lines
# 314-317) is not visible in this listing.
313 if self._downloader is None:
318 downloader_params = self._downloader.get_params()
320 # Attempt to use provided username and password or .netrc data
321 if downloader_params.get('username', None) is not None:
322 username = downloader_params['username']
# NOTE(review): indexed directly, so a 'username' option given without a
# 'password' option would raise KeyError here -- confirm callers always
# supply both.
323 password = downloader_params['password']
324 elif downloader_params.get('usenetrc', False):
# Look up the credentials stored for _NETRC_MACHINE in the user's .netrc file.
326 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# NOTE(review): the condition guarding this raise (original lines 327-330) is
# not shown; presumably it fires when no entry was found.
331 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# Problems with .netrc are reported as a warning.
332 except (IOError, netrc.NetrcParseError), err:
333 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
# Fields of the login form submitted to _LOGIN_URL.
# NOTE(review): the line opening this dict and some of its entries are
# missing from this listing.
341 'current_form': 'loginForm',
343 'action_login': 'Log In',
344 'username': username,
345 'password': password,
# Passing a data argument makes urllib2 send the request as a POST.
347 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
349 self.to_stdout('[youtube] Logging in')
350 login_results = urllib2.urlopen(request).read()
# Getting the login form back in the response is taken to mean that the
# credentials were rejected.
351 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
352 self.to_stderr('WARNING: Unable to log in: bad username or password')
354 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
355 self.to_stderr('WARNING: Unable to log in: %s' % str(err))
# Field of the age confirmation form submitted to _AGE_URL.
361 'action_confirm': 'Confirm',
363 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
365 self.to_stdout('[youtube] Confirming age')
366 age_results = urllib2.urlopen(request).read()
# A network failure while confirming age terminates the program, whereas the
# login failures above only print a warning.
367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368 sys.exit('ERROR: Unable to confirm age: %s' % str(err))
# Scrape the YouTube watch page for one video and build its result dictionary.
370 def _real_extract(self, url):
371 # Extract video id from URL
# Group 1 (optional) is the youtube.com URL prefix and group 2 the video id.
# The conditional (?(1).+)? only allows trailing characters when the prefix
# matched, so a bare video id is accepted as well.
372 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
# NOTE(review): the `if mobj is None:` tests guarding this and the later error
# reports, and what follows each report, are on lines missing from this
# listing.
374 self.to_stderr('ERROR: Invalid URL: %s' % url)
376 video_id = mobj.group(2)
378 # Downloader parameters
380 if self._downloader is not None:
381 params = self._downloader.get_params()
382 format_param = params.get('format', None)
# Format code 18 maps to the 'mp4' extension; any other value falls back to
# 'flv'.
# NOTE(review): the key is the integer 18; if the 'format' option arrives as a
# string the lookup falls through to 'flv' -- confirm the option's type
# (original lines 383-384 are not visible).
385 video_extension = {18: 'mp4'}.get(format_param, 'flv')
387 # Normalize URL, including format
388 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
389 if format_param is not None:
390 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
391 request = urllib2.Request(normalized_url, None, std_headers)
393 self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
394 video_webpage = urllib2.urlopen(request).read()
# Failing to fetch the watch page terminates the program.
395 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
396 sys.exit('ERROR: Unable to download video: %s' % str(err))
397 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
# The "t" value found in the page is needed to build the get_video URL below.
400 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
402 self.to_stderr('ERROR: Unable to extract "t" parameter')
404 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
405 if format_param is not None:
406 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
407 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
# The uploader nickname is the text that follows 'More From: ' in the page.
410 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
412 self.to_stderr('ERROR: Unable to extract uploader nickname')
414 video_uploader = mobj.group(1)
# The title is the <title> element's text after the 'YouTube - ' prefix.
417 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
419 self.to_stderr('ERROR: Unable to extract video title')
# Decode the page bytes as UTF-8, then replace named HTML entities (&name;)
# with the corresponding characters.
# NOTE(review): the lambda indexes name2codepoint directly, so a numeric
# entity such as &#39; or an unknown name would raise KeyError -- confirm
# this cannot occur.
421 video_title = mobj.group(1).decode('utf-8')
422 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
# Simplified title: every run of characters outside simple_title_chars becomes
# a single underscore, then leading and trailing underscores are removed.
425 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
426 simple_title = simple_title.strip(ur'_')
# Entries of the result dictionary.
# NOTE(review): the surrounding return statement and the 'id' entry are on
# lines missing from this listing.
431 'url': video_real_url,
432 'uploader': video_uploader,
433 'title': video_title,
434 'stitle': simple_title,
435 'ext': video_extension,
# Script entry point: set up urllib2, register the YouTube extractor and
# download a few sample videos.
438 if __name__ == '__main__':
440 # General configuration
# NOTE(review): install_opener() replaces the process-wide opener, so only the
# opener from the second call stays installed. build_opener() adds a
# ProxyHandler by default, which makes the first call look redundant --
# confirm.
441 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
442 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
444 # Information extractors
445 youtube_ie = YoutubeIE()
# NOTE(review): the other option entries of this dict (original lines 449-454)
# are not visible in this listing.
448 fd = FileDownloader({
# Yields paths of the form <ext>/<ext>/<id>.<ext>.
455 'outtmpl': '%(ext)s/%(ext)s/%(id)s.%(ext)s'
457 fd.add_info_extractor(youtube_ie)
# Sample video URLs.
# NOTE(review): the call that receives them is on a line not shown here,
# presumably fd.download([...]) -- confirm.
459 'http://www.youtube.com/watch?v=t7qdwI7TVe8',
460 'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
461 'http://www.youtube.com/watch?v=DZRXe1wtC-M',
# sys.exit() with a string prints that message to stderr and exits with a
# failure status.
464 except KeyboardInterrupt:
465 sys.exit('\nERROR: Interrupted by user')