Add .to_stderr() to downloaders
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
# HTTP headers sent with every request so the servers see a regular browser.
# The User-Agent value must be the bare product string: the header name is
# supplied by the dictionary key, not repeated inside the value.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}
25
# ASCII letters followed by the decimal digits, as one unicode string.
simple_title_chars = u'%s%s' % (string.ascii_letters, string.digits)
27
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        """

        _params = None
        _ies = []

        def __init__(self, params):
                """Create a downloader configured by the params dictionary."""
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p".

                The last path component is taken to be the file itself and is not
                created. Raises OSError if a directory cannot be created.
                """
                directory = os.path.dirname(filename)
                # An empty dirname means the file lives in the current directory.
                # Working from dirname also copes with absolute paths, whose
                # leading separator must never be turned into an empty mkdir().
                if directory and not os.path.isdir(directory):
                        os.makedirs(directory)

        @staticmethod
        def format_bytes(bytes):
                """Return a byte count as text with a 1024-based unit suffix.

                Accepts a number, a numeric string or None (reported as 'N/A').
                """
                if bytes is None:
                        return 'N/A'
                suffixes = 'bkMGTPEZY'
                amount = float(bytes)
                if amount < 1.0:
                        # log() is undefined at 0 and negative below 1; both cases
                        # belong to the plain-bytes unit.
                        exponent = 0
                else:
                        # Clamp so that absurdly large values reuse the last suffix
                        # instead of indexing past the end of the table.
                        exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
                suffix = suffixes[exponent]
                converted = amount / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the completed percentage as a 6-character string.

                A placeholder is returned when the total length is unknown (None)
                or not positive, since no ratio can be computed then.
                """
                if data_len is None:
                        return '---.-%'
                total = float(data_len)
                if total <= 0:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS'.

                start and now are timestamps in seconds; total and current are byte
                counts. '--:--' is returned when the total is unknown, when there is
                not enough data yet, or when the estimate exceeds 99 minutes.
                """
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = int((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average transfer speed as a 10-character string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Return the size to request for the next read.

                The size follows the rate measured on the last read (bytes received
                in elapsed_time seconds) but is kept between half and twice the
                last block, and never above 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        def set_params(self, params):
                """Sets parameters. Raises ValueError unless params is a dictionary."""
                if not isinstance(params, dict):
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self._params.get('quiet', False):
                        sys.stdout.write('%s%s' % (message, '' if skip_eol else '\n'))
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                sys.stderr.write('%s\n' % message)

        def download(self, url_list):
                """Download a given list of URLs.

                Each URL is handed to the first registered InfoExtractor that
                reports it as suitable. Problems with individual files are
                reported on stderr and do not stop the remaining downloads. The
                program exits if several files would share one fixed output name.
                """
                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                results = [x for x in ie.extract(url) if x is not None]

                                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                                        sys.exit('ERROR: fixed output name but more than one file to download')

                                if not self._params.get('simulate', False):
                                        for result in results:
                                                self._download_result(result)
                                # Only the first suitable extractor handles a URL,
                                # whether or not this is a simulation.
                                break
                        if not suitable_found:
                                self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)

        def _download_result(self, result):
                """Save the video described by one result dictionary to disk."""
                try:
                        filename = self._params['outtmpl'] % result
                except KeyError as err:
                        self.to_stderr('ERROR: invalid output template: %s' % str(err))
                        return
                try:
                        self.pmkdir(filename)
                except (OSError, IOError) as err:
                        self.to_stderr('ERROR: unable to create directories: %s' % str(err))
                        return
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError) as err:
                        self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
                        return
                try:
                        try:
                                self._do_download(outstream, result['url'])
                        finally:
                                # Release the file even when the transfer fails.
                                outstream.close()
                # Network errors are tested first: urllib2.URLError derives from
                # IOError, so the generic clause below would otherwise claim them.
                # ValueError is how _do_download reports a truncated transfer.
                except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError) as err:
                        self.to_stderr('ERROR: unable to download video data: %s' % str(err))
                except (OSError, IOError) as err:
                        self.to_stderr('ERROR: unable to write video data: %s' % str(err))

        def _do_download(self, stream, url):
                """Fetch url and copy its body into stream, printing progress.

                Raises ValueError when the server announced a Content-length and a
                different number of bytes was received. Network and I/O errors
                propagate to the caller.
                """
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                try:
                        # The header arrives as text. Turn it into a number, or None
                        # when it is missing or malformed, so the progress arithmetic
                        # and the final size check work on real integers.
                        try:
                                data_len = int(data.info().get('Content-length', None))
                        except (TypeError, ValueError):
                                data_len = None
                        data_len_str = self.format_bytes(data_len)
                        byte_counter = 0
                        block_size = 1024
                        start = time.time()
                        while True:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                                speed_str = self.calc_speed(start, time.time(), byte_counter)
                                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                                before = time.time()
                                data_block = data.read(block_size)
                                after = time.time()
                                data_block_len = len(data_block)
                                if data_block_len == 0:
                                        break
                                byte_counter += data_block_len
                                stream.write(data_block)
                                block_size = self.best_block_size(after - before, data_block_len)

                        # Terminate the progress line that was rewritten with '\r'.
                        self.to_stdout('')
                finally:
                        data.close()
                if data_len is not None and byte_counter != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
230
class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return True

        def initialize(self):
                """Initializes an instance (login, etc). Only the first call does work."""
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout unless the downloader is in quiet mode.

                Without a downloader there is no quiet setting to consult, so the
                message is always printed.
                """
                if self._downloader is None or not self._downloader.get_params().get('quiet', False):
                        sys.stdout.write('%s\n' % message)
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                sys.stderr.write('%s\n' % message)

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses.

                The base implementation knows no videos and returns an empty list,
                so extract() always yields something that can be iterated.
                """
                return []
297
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _LOGIN_URL = 'http://www.youtube.com/login?next=/'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
        _NETRC_MACHINE = 'youtube'

        @staticmethod
        def _decode_entity(match):
                """Return the character for one matched HTML entity.

                Handles named ('&amp;'), decimal ('&#39;') and hexadecimal
                ('&#x27;') entities. Anything unrecognised is returned unchanged.
                """
                entity = match.group(1)
                try:
                        if entity[:2] in (u'#x', u'#X'):
                                return unichr(int(entity[2:], 16))
                        if entity[:1] == u'#':
                                return unichr(int(entity[1:]))
                        return unichr(htmlentitydefs.name2codepoint[entity])
                except (KeyError, ValueError, OverflowError):
                        return match.group(0)

        def _real_initialize(self):
                """Log in and confirm age when credentials are available.

                Credentials come from the downloader parameters or, if requested,
                from the .netrc file. Login problems only produce warnings; a
                network failure while confirming age ends the program.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        # The password entry may be absent when only a username is set.
                        password = downloader_params.get('password', None)
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError) as err:
                                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                                return

                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.to_stdout('[youtube] Logging in')
                        login_results = urllib2.urlopen(request).read()
                        # Getting the login form back means the credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr('WARNING: Unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self.to_stderr('WARNING: Unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.to_stdout('[youtube] Confirming age')
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to confirm age: %s' % str(err))

        def _real_extract(self, url):
                """Return a one-element list describing the video behind url.

                The element is a dictionary with the id, url, uploader, title,
                stitle and ext keys, or None when the URL is not recognised or a
                field cannot be found in the page. A network failure while
                fetching the page ends the program.
                """
                # Extract video id from URL
                mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
                if mobj is None:
                        self.to_stderr('ERROR: Invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension (the format code may be given as a number or as text)
                video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to download video: %s' % str(err))
                self.to_stdout('[youtube] %s: Extracting video information' % video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

                # uploader
                mobj = re.search(r'More From: ([^<]*)<', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract video title')
                        return [None]
                # Undecodable bytes are replaced rather than aborting the extraction.
                video_title = mobj.group(1).decode('utf-8', 'replace')
                # Match only well-formed entities so a stray '&' followed later by
                # a ';' is left alone.
                video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._decode_entity, video_title)

                # simplified title
                simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
                simple_title = simple_title.strip(u'_')

                # Return information
                return [{
                        'id':           video_id,
                        'url':          video_real_url,
                        'uploader':     video_uploader,
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension,
                        }]
430
if __name__ == '__main__':
        try:
                # General configuration: one opener carrying both the proxy and
                # the cookie handlers. Installing them through separate calls
                # would leave only the last opener in effect.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

                # Information extractors
                youtube_ie = YoutubeIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': False,
                        'username': None,
                        'password': None,
                        'quiet': False,
                        'simulate': True,
                        'format': None,
                        'outtmpl': '%(id)s.%(ext)s'
                        })
                fd.add_info_extractor(youtube_ie)
                fd.download([
                        'http://www.youtube.com/watch?v=t7qdwI7TVe8',
                        'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
                        'http://www.youtube.com/watch?v=DZRXe1wtC-M',
                        ])

        except KeyboardInterrupt:
                sys.exit('\nERROR: Interrupted by user')