Check the output name is not fixed when there are several videos to download
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
# HTTP headers attached to every request this script makes, so that it
# presents itself like a regular Firefox 3.0 browser.
std_headers = {
        # The value must be the bare product string: the header name is already
        # supplied by the dictionary key, so repeating a "UserAgent:" label
        # here would be sent as part of the value.
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}
25
# Unicode string holding the only characters kept in a "simplified" title
# (ASCII letters followed by digits).
simple_title_chars = u''.join([string.ascii_letters, string.digits])
27
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader does not know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it is instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        format:         Video format code.
        outtmpl:        Template for output names.
        """

        _params = None
        _ies = []

        def __init__(self, params):
                """Create a downloader configured by the params dictionary."""
                # Rebind per instance so the class-level list is never shared.
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p".

                The last component is taken to be the file itself and is not
                created. Raises OSError if a directory cannot be created.
                """
                components = filename.split(os.sep)
                for count in range(1, len(components)):
                        directory = os.sep.join(components[0:count])
                        # An absolute path starts with an empty component, which
                        # joins to ''; that is not a directory to be created.
                        if directory and not os.path.exists(directory):
                                os.mkdir(directory)

        @staticmethod
        def format_bytes(bytes):
                """Return a short human-readable size such as '500b' or '1.50k'.

                bytes may be a number, a numeric string (as found in HTTP
                headers) or None, in which case 'N/A' is returned.
                """
                if bytes is None:
                        return 'N/A'
                suffixes = 'bkMGTPEZY'
                value = float(bytes)
                if value < 1.0:
                        # Zero (numeric or the string '0') has no logarithm and
                        # fractional values have a negative one; both are plain
                        # byte counts.
                        exponent = 0
                else:
                        # Never index past the largest known suffix.
                        exponent = min(int(math.log(value, 1024.0)), len(suffixes) - 1)
                if exponent == 0:
                        return '%s%s' % (bytes, suffixes[0])
                converted = value / float(1024 ** exponent)
                return '%.2f%s' % (converted, suffixes[exponent])

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the completed percentage as a 6-character string.

                '---.-%' is returned when the total length is unknown (None) or
                zero, since no meaningful ratio exists in those cases.
                """
                if data_len is None:
                        return '---.-%'
                total = float(data_len)
                if total == 0:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS'.

                '--:--' is returned when the total is unknown, when nothing has
                been measured yet, or when the estimate exceeds 99 minutes.
                """
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = int((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average transfer speed as a padded string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%9s' % 'N/A b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Return the size to request for the next read.

                The size follows the rate observed for the last block, but is
                kept between half and double the last block size and never
                above 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        def set_params(self, params):
                """Sets parameters. Raises ValueError unless params is a dictionary."""
                if not isinstance(params, dict):
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def download(self, url_list):
                """Download a given list of URLs.

                Each URL is handed to the first registered InfoExtractor that
                reports being suitable for it. Problems with individual videos
                are reported on stderr and do not stop the remaining downloads.
                The program exits if a fixed output name would be used for more
                than one file.
                """
                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                # Tolerate an extractor that returns nothing at all.
                                results = [x for x in (ie.extract(url) or []) if x is not None]

                                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                                        sys.exit('ERROR: fixed output name but more than one file to download')

                                for result in results:
                                        self._download_result(result)
                                break
                        if not suitable_found:
                                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

        def _download_result(self, result):
                """Download the single video described by result, reporting errors on stderr."""
                try:
                        filename = self._params['outtmpl'] % result
                except KeyError as err:
                        sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
                        return
                try:
                        self.pmkdir(filename)
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
                        return
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
                        return
                try:
                        try:
                                self._do_download(outstream, result['url'])
                        finally:
                                # Close the file on failure too; errors raised by
                                # close() itself are reported by the handlers below.
                                outstream.close()
                # The network handler comes first: urllib2.URLError derives from
                # IOError, so the generic handler would otherwise claim it.
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
                except ValueError as err:
                        # Raised by _do_download() when the body length is wrong.
                        sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))

        def _do_download(self, stream, url):
                """Fetch url and copy its body into stream, printing progress.

                Progress goes to stdout unless the 'quiet' option is set.
                Raises ValueError if the number of bytes received differs from
                the Content-length announced by the server.
                """
                quiet = self._params.get('quiet', False)
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                try:
                        data_len = data.info().get('Content-length', None)
                        if data_len is not None:
                                # The header is text; compare and compute with a number.
                                try:
                                        data_len = int(data_len)
                                except ValueError:
                                        # Unusable header: treat the length as unknown.
                                        data_len = None
                        data_len_str = self.format_bytes(data_len)
                        byte_counter = 0
                        block_size = 1024
                        start = time.time()
                        while True:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                                speed_str = self.calc_speed(start, time.time(), byte_counter)

                                if not quiet:
                                        sys.stdout.write('\r[download] %s of %s at %s ETA %s' %
                                                        (percent_str, data_len_str, speed_str, eta_str))
                                        sys.stdout.flush()

                                before = time.time()
                                data_block = data.read(block_size)
                                after = time.time()
                                data_block_len = len(data_block)
                                if data_block_len == 0:
                                        break
                                byte_counter += data_block_len
                                stream.write(data_block)
                                # Adapt the next read to the speed just observed.
                                block_size = self.best_block_size(after - before, data_block_len)
                finally:
                        data.close()

                if not quiet:
                        # Finish the progress line that was rewritten with '\r'.
                        sys.stdout.write('\n')

                if data_len is not None and byte_counter != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
223
class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return True

        def initialize(self):
                """Initializes an instance (login, etc)."""
                # The real initialization runs only once per instance.
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                results = self._real_extract(url)
                if results is None:
                        # The default _real_extract() returns nothing; honour the
                        # documented "list of dicts" contract so callers can
                        # always iterate over the result.
                        return []
                return results

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Write message to stdout unless the downloader's 'quiet' option is set."""
                if self._downloader is None or not self._downloader.get_params().get('quiet', False):
                        sys.stdout.write('%s\n' % message)

        def to_stderr(self, message):
                """Write message to stderr, regardless of the 'quiet' option."""
                sys.stderr.write('%s\n' % message)

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
290
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _LOGIN_URL = 'http://www.youtube.com/login?next=/'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
        _NETRC_MACHINE = 'youtube'

        @staticmethod
        def _htmlentity_transform(matchobj):
                """Return the character for one HTML entity matched by re.sub().

                Group 1 of matchobj is the entity without '&' and ';'. Named
                entities and decimal or hexadecimal character references are
                converted; anything else is returned exactly as it was matched.
                """
                entity = matchobj.group(1)
                if entity in htmlentitydefs.name2codepoint:
                        return unichr(htmlentitydefs.name2codepoint[entity])
                numeric = re.match(r'#([xX][0-9A-Fa-f]+|[0-9]+)$', entity)
                if numeric is not None:
                        digits = numeric.group(1)
                        if digits[0] in 'xX':
                                codepoint = int(digits[1:], 16)
                        else:
                                codepoint = int(digits, 10)
                        try:
                                return unichr(codepoint)
                        except (ValueError, OverflowError):
                                # Not a representable code point: keep the text.
                                pass
                return matchobj.group(0)

        def _real_initialize(self):
                """Log in and confirm age when credentials are available.

                Credentials come from the downloader's 'username'/'password'
                options or, if 'usenetrc' is set, from the .netrc file. Without a
                downloader or credentials nothing is done. Login problems only
                produce a warning; a failed age confirmation request exits.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        # A missing password entry must not raise KeyError.
                        password = downloader_params.get('password', None)
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError) as err:
                                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                                return

                if username is None:
                        return

                # Log in
                login_form = {  'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,       }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.to_stdout('[youtube] Logging in')
                        login_results = urllib2.urlopen(request).read()
                        # Being served the login form again means the login was rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr('WARNING: Unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self.to_stderr('WARNING: Unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {    'next_url':             '/',
                                'action_confirm':       'Confirm',      }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.to_stdout('[youtube] Confirming age')
                        # Only the request matters; the response body is not used.
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to confirm age: %s' % str(err))

        def _real_extract(self, url):
                """Return a one-element list describing the video at url.

                The element is a dictionary with the id, url, uploader, title,
                stitle and ext fields, or None if the URL is invalid or a field
                cannot be found in the video page. Exits the program if the
                video page cannot be downloaded.
                """
                # Extract video id from URL
                mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
                if mobj is None:
                        self.to_stderr('ERROR: Invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension: the format code may be given as a number or as text
                video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to download video: %s' % str(err))
                self.to_stdout('[youtube] %s: Extracting video information' % video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

                # uploader
                mobj = re.search(r'More From: ([^<]*)<', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                # Only well-formed entities are matched, so a stray '&' followed
                # later by a ';' is left alone.
                video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._htmlentity_transform, video_title)

                # simplified title
                simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
                simple_title = simple_title.strip(u'_')

                # Return information
                return [{       'id':           video_id,
                                'url':          video_real_url,
                                'uploader':     video_uploader,
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension,
                                }]
417
if __name__ == '__main__':
        try:
                # General configuration: install ONE opener carrying both the
                # proxy handler and the cookie processor. install_opener()
                # replaces the previously installed opener, so installing them
                # in two separate calls keeps only the last one.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

                # Information extractors
                youtube_ie = YoutubeIE()

                # File downloader
                fd = FileDownloader({   'usenetrc': False,
                                        'username': None,
                                        'password': None,
                                        'quiet': False,
                                        'format': None,
                                        'outtmpl': '%(id)s.%(ext)s'
                                        })
                fd.add_info_extractor(youtube_ie)
                fd.download([   'http://www.youtube.com/watch?v=t7qdwI7TVe8',
                                'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
                                'http://www.youtube.com/watch?v=DZRXe1wtC-M',   ])

        except KeyboardInterrupt:
                # Exit with a short message instead of a traceback on Ctrl-C.
                sys.exit('\nERROR: Interrupted by user')