adb4234c7be89d4507426d8cab3d66866474f0a5
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
# HTTP headers sent with every request so that the site serves the same
# pages it would serve to a regular desktop browser.
std_headers = {
        # The value must be the bare product string; a "UserAgent: " prefix
        # inside the value would be sent to the server as part of the agent name.
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}
25
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
27
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, a task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        format:         Video format code.
        outtmpl:        Template for output names.
        """

        _params = None
        # Class-level default only; __init__ gives every instance its own list.
        _ies = []

        def __init__(self, params):
                """Store the options dictionary and start with no extractors."""
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p".

                The last component is taken to be the file itself and is never
                created. Nothing happens when filename has no directory part.
                Raises OSError if a directory cannot be created.
                """
                directory = os.path.dirname(filename)
                # Working on the directory part as a whole (instead of joining
                # split components) keeps absolute paths from producing an
                # empty first component that cannot be created.
                if directory and not os.path.isdir(directory):
                        os.makedirs(directory)

        @staticmethod
        def format_bytes(bytes):
                """Return a byte count as a short string such as '1.50k'.

                bytes may be a number, a numeric string (as found in HTTP
                headers) or None, which yields 'N/A'. The suffix is one of
                'bkMGTPEZY', each step meaning another factor of 1024.
                """
                if bytes is None:
                        return 'N/A'
                suffixes = 'bkMGTPEZY'
                converted = float(bytes)
                exponent = 0
                # Repeated division keeps the exponent inside the suffix table
                # for zero, fractional and huge values alike, and is exact at
                # the powers of 1024.
                while converted >= 1024.0 and exponent < len(suffixes) - 1:
                        converted /= 1024.0
                        exponent += 1
                return '%.2f%s' % (converted, suffixes[exponent])

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the completed percentage as a 6-character string.

                '---.-%' is returned when the total length is unknown (None)
                or zero, as no meaningful percentage exists in those cases.
                """
                if data_len is None:
                        return '---.-%'
                total = float(data_len)
                if total == 0:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS'.

                start and now are timestamps in seconds, total is the expected
                byte count (or None) and current the bytes received so far.
                '--:--' is returned when no estimate can be made or when it
                would exceed 99 minutes.
                """
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = int((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average speed as a right-aligned 10-character string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Return the size to request for the next read.

                The size follows the rate observed for the last block (bytes
                read in elapsed_time seconds) but never shrinks below half or
                grows above twice the last block, and never above 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        def set_params(self, params):
                """Sets parameters. Raises ValueError unless params is a dictionary."""
                if not isinstance(params, dict):
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if self._params.get('quiet', False):
                        return
                if skip_eol:
                        terminator = ''
                else:
                        terminator = '\n'
                sys.stdout.write('%s%s' % (message, terminator))
                sys.stdout.flush()

        def download(self, url_list):
                """Download a given list of URLs.

                Each URL goes to the first registered InfoExtractor that
                reports it as suitable. Problems with a single file are
                reported on stderr and the remaining files are still tried.
                """
                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                results = [x for x in ie.extract(url) if x is not None]

                                # A template without any %(field)s always yields the same
                                # name, so several files would overwrite each other.
                                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                                        sys.exit('ERROR: fixed output name but more than one file to download')

                                for result in results:
                                        self._download_result(result)
                                break
                        if not suitable_found:
                                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

        def _download_result(self, result):
                """Write the video described by one result dictionary to disk."""
                try:
                        filename = self._params['outtmpl'] % result
                except KeyError as err:
                        sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
                        return
                try:
                        self.pmkdir(filename)
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
                        return
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
                        return
                try:
                        try:
                                self._do_download(outstream, result['url'])
                        finally:
                                # Release the file handle on failure as well.
                                outstream.close()
                # urllib2.URLError derives from IOError, so the network errors
                # have to be tested before the generic write errors.
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))
                except ValueError as err:
                        # Raised by _do_download for truncated content.
                        sys.stderr.write('ERROR: %s\n' % str(err))

        def _do_download(self, stream, url):
                """Copy the resource at url into stream while printing progress.

                Raises ValueError when the server announced a Content-length
                that differs from the number of bytes actually received.
                """
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                try:
                        # Header value: a string, or None when the header is absent.
                        data_len = data.info().get('Content-length', None)
                        data_len_str = self.format_bytes(data_len)
                        byte_counter = 0
                        block_size = 1024
                        start = time.time()
                        while True:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                                speed_str = self.calc_speed(start, time.time(), byte_counter)
                                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                                before = time.time()
                                data_block = data.read(block_size)
                                after = time.time()
                                data_block_len = len(data_block)
                                if data_block_len == 0:
                                        break
                                byte_counter += data_block_len
                                stream.write(data_block)
                                # Adapt the next read to the speed just observed.
                                block_size = self.best_block_size(after - before, data_block_len)
                finally:
                        data.close()

                self.to_stdout('')
                # Compared as numbers so that whitespace around the header
                # value cannot cause a false mismatch.
                if data_len is not None and byte_counter != int(data_len):
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
222
class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        # True once _real_initialize() has been run for the instance.
        _ready = False
        # Downloader in charge of this extractor, or None.
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return True

        def initialize(self):
                """Initializes an instance (login, etc)."""
                # The real initialization is run at most once per instance.
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message unless the downloader is in quiet mode."""
                if self._downloader is None or not self._downloader.get_params().get('quiet', False):
                        print(message)

        def to_stderr(self, message):
                """Write message and a newline to stderr."""
                sys.stderr.write('%s\n' % message)

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses.

                The default finds nothing and returns an empty list, so that
                extract() returns a list even for an extractor that does not
                redefine this method.
                """
                return []
289
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _LOGIN_URL = 'http://www.youtube.com/login?next=/'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
        _NETRC_MACHINE = 'youtube'
        # File extension per format code; every other format is saved as 'flv'.
        # Keys are strings so that 18 and '18' select the same entry.
        _VIDEO_EXTENSIONS = {'18': 'mp4'}

        @staticmethod
        def _decode_entity(mobj):
                """Return the character for one matched HTML entity.

                Handles named entities as well as decimal (&#39;) and
                hexadecimal (&#x27;) references. Anything that cannot be
                decoded is returned as it appeared in the text.
                """
                entity = mobj.group(1)
                try:
                        if entity.startswith(u'#'):
                                if entity[1:2] in (u'x', u'X'):
                                        return unichr(int(entity[2:], 16))
                                return unichr(int(entity[1:], 10))
                        return unichr(htmlentitydefs.name2codepoint[entity])
                except (KeyError, ValueError, OverflowError):
                        return mobj.group(0)

        def _real_initialize(self):
                """Log in and confirm age when credentials are available.

                Credentials come from the downloader parameters or, if
                'usenetrc' is set, from the .netrc entry for _NETRC_MACHINE.
                Login problems only produce a warning; a failed age
                confirmation request terminates the program.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        # Raised here to share the warning path below.
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError) as err:
                                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                                return

                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.to_stdout('[youtube] Logging in')
                        login_results = urllib2.urlopen(request).read()
                        # Being served the login form again means the login was rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr('WARNING: Unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self.to_stderr('WARNING: Unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.to_stdout('[youtube] Confirming age')
                        # The body is read to complete the request; its content is not used.
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to confirm age: %s' % str(err))

        def _real_extract(self, url):
                """Return a one-element list describing the video at url.

                The element is a dictionary with the keys 'id', 'url',
                'uploader', 'title', 'stitle' and 'ext', or None when the
                information could not be extracted (an error is written to
                stderr in that case). Failure to fetch the video page
                terminates the program.
                """
                # Extract video id from URL
                mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
                if mobj is None:
                        self.to_stderr('ERROR: Invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension
                video_extension = self._VIDEO_EXTENSIONS.get(str(format_param), 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to download video: %s' % str(err))
                self.to_stdout('[youtube] %s: Extracting video information' % video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

                # uploader
                mobj = re.search(r'More From: ([^<]*)<', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                # Only well-formed references are matched, so a bare "&"
                # followed later by a ";" is left alone.
                video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._decode_entity, video_title)

                # simplified title
                simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
                simple_title = simple_title.strip(u'_')

                # Return information
                return [{
                        'id':           video_id,
                        'url':          video_real_url,
                        'uploader':     video_uploader,
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension,
                        }]
422
if __name__ == '__main__':
        try:
                # General configuration
                # Both handlers go into a single opener: installing a second
                # opener replaces the previously installed one instead of
                # adding to it.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

                # Information extractors
                youtube_ie = YoutubeIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': False,
                        'username': None,
                        'password': None,
                        'quiet': False,
                        'format': None,
                        'outtmpl': '%(id)s.%(ext)s'
                        })
                fd.add_info_extractor(youtube_ie)
                fd.download([
                        'http://www.youtube.com/watch?v=t7qdwI7TVe8',
                        'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
                        'http://www.youtube.com/watch?v=DZRXe1wtC-M',
                        ])

        except KeyboardInterrupt:
                # Exit with a short message instead of a traceback on Ctrl-C.
                sys.exit('\nERROR: Interrupted by user')