45773fda2480257a7cf42b5907118ee9664d629b
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
# Default HTTP headers attached to every request built in this file; the
# values imitate Firefox 3.0. The User-Agent value must hold only the agent
# string: the header name is supplied by the dictionary key.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}
25
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
27
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports it's able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        format:         Video format code.
        outtmpl:        Template for output names.
        """

        _params = None
        _ies = []

        def __init__(self, params):
                """Store the options dictionary and start with no InfoExtractors."""
                # Rebind per instance so the class-level list is never shared.
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p".

                The last component is taken to be the file itself and is not
                created. Raises OSError when a directory cannot be created.
                """
                components = filename.split(os.sep)
                for count in range(1, len(components)):
                        directory = os.sep.join(components[:count])
                        # An absolute path splits into an empty first component;
                        # os.mkdir('') would fail, so empty prefixes are skipped.
                        if directory and not os.path.exists(directory):
                                os.mkdir(directory)

        @staticmethod
        def format_bytes(bytes):
                """Return a human-readable size such as '512b' or '1.50M'.

                bytes may be a number, a numeric string or None; None yields
                'N/A'. Values below 1024 are printed as given with a 'b' suffix.
                """
                if bytes is None:
                        return 'N/A'
                suffixes = 'bkMGTPEZY'
                converted = float(bytes)
                exponent = 0
                # Scaling by repeated division works for zero (math.log() does
                # not) and cannot pick the wrong unit through log() rounding.
                while converted >= 1024.0 and exponent < len(suffixes) - 1:
                        converted /= 1024.0
                        exponent += 1
                if exponent == 0:
                        return '%s%s' % (bytes, suffixes[0])
                return '%.2f%s' % (converted, suffixes[exponent])

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the progress as a 6-column percentage string.

                '---.-%' is returned when the total is unknown (None) or zero.
                """
                if data_len is None:
                        return '---.-%'
                total = float(data_len)
                if total == 0.0:
                        # An empty body has no meaningful percentage
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS'.

                '--:--' is returned when the total is unknown, nothing has been
                received yet, less than a millisecond has elapsed or the
                estimate exceeds 99 minutes.
                """
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = int((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average speed as a 10-column string such as '  1.50k/s'."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        # Same width as the real value so the progress line
                        # keeps its columns aligned.
                        return '%10s' % 'N/A b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Return the size to request for the next block.

                The measured rate (bytes / elapsed_time) is used, limited to the
                range from half to twice the size of the block just read and
                never above 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        def set_params(self, params):
                """Sets parameters. Raises ValueError unless params is a dictionary."""
                if not isinstance(params, dict):
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def download(self, url_list):
                """Download a given list of URLs.

                Each URL is handled by the first registered InfoExtractor that
                reports it as suitable. Problems are reported on stderr and the
                remaining results and URLs are still processed.
                """
                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                for result in ie.extract(url):
                                        if result is not None:
                                                self._process_result(result)
                                break
                        if not suitable_found:
                                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

        def _process_result(self, result):
                """Download the video described by one InfoExtractor result."""
                try:
                        filename = self._params['outtmpl'] % result
                except (KeyError, ValueError) as err:
                        sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
                        return
                try:
                        self.pmkdir(filename)
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
                        return
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
                        return
                try:
                        try:
                                self._do_download(outstream, result['url'])
                        finally:
                                # Close the file even when the transfer fails
                                outstream.close()
                except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError) as err:
                        # Must come before the IOError clause: urllib2.URLError
                        # derives from IOError and would be reported as a
                        # write error otherwise.
                        sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))

        def _do_download(self, stream, url):
                """Copy the contents of url into stream, reporting progress.

                Progress is written to stdout unless the 'quiet' option is set.
                Raises ValueError when the number of bytes received differs
                from the announced Content-Length.
                """
                quiet = self._params.get('quiet', False)
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                try:
                        data_len = data.info().get('Content-length', None)
                        if data_len is not None:
                                try:
                                        data_len = int(data_len)
                                except ValueError:
                                        # Unparseable header: treat the size as unknown
                                        data_len = None
                        data_len_str = self.format_bytes(data_len)
                        byte_counter = 0
                        block_size = 1024
                        start = time.time()
                        while True:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                                speed_str = self.calc_speed(start, time.time(), byte_counter)

                                if not quiet:
                                        sys.stdout.write('\r[download] %s of %s at %s ETA %s' %
                                                        (percent_str, data_len_str, speed_str, eta_str))
                                        sys.stdout.flush()

                                before = time.time()
                                data_block = data.read(block_size)
                                after = time.time()
                                data_block_len = len(data_block)
                                if data_block_len == 0:
                                        break
                                byte_counter += data_block_len
                                stream.write(data_block)
                                # Adapt the next read to the speed just observed
                                block_size = self.best_block_size(after - before, data_block_len)
                finally:
                        data.close()

                if not quiet:
                        # Finish the progress line
                        sys.stdout.write('\n')

                if data_len is not None and byte_counter != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
220
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor receives a URL and extracts information
        about the video (or videos) it refers to: the real video URL, the
        title and simplified title, the author and others. The extract()
        method returns this as a list of dictionaries, because one URL may
        refer to several videos (think of playlists). Each dictionary must
        include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses should redefine _real_initialize() and _real_extract(),
        as well as the suitable() static method. Probably, they should also
        be instantiated and added to the main downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create a not yet initialized extractor with an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Tell whether this IE can handle the given URL (always True here)."""
                return True

        def initialize(self):
                """Run the real initialization (login, etc) the first time it is called."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed and return the information for url as a list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Remember the downloader this IE works for."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message unless the downloader has the 'quiet' option set."""
                downloader = self._downloader
                if downloader is not None and downloader.get_params().get('quiet', False):
                        return
                print(message)

        def to_stderr(self, message):
                """Write message and a newline to stderr."""
                sys.stderr.write('%s\n' % message)

        def _real_initialize(self):
                """Initialization hook; does nothing here. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Extraction hook; does nothing here. Redefine in subclasses."""
                pass
287
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _LOGIN_URL = 'http://www.youtube.com/login?next=/'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
        _NETRC_MACHINE = 'youtube'

        def _real_initialize(self):
                """Log in and confirm age when credentials are available.

                Credentials are taken from the 'username' and 'password'
                downloader options or, when 'usenetrc' is set, from the .netrc
                entry for _NETRC_MACHINE. Nothing is done without a downloader
                or without a username. Login problems only produce a warning;
                a failed age confirmation request terminates the program.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        # The password key may be absent when only a username is given
                        password = downloader_params.get('password', None)
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError) as err:
                                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                                return

                if username is None:
                        return

                # Log in
                login_form = {  'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,       }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.to_stdout('[youtube] Logging in')
                        login_results = urllib2.urlopen(request).read()
                        # Getting the login form back means the credentials were rejected
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr('WARNING: Unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self.to_stderr('WARNING: Unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {    'next_url':             '/',
                                'action_confirm':       'Confirm',      }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.to_stdout('[youtube] Confirming age')
                        # Only the request matters; the response body is not used
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to confirm age: %s' % str(err))

        @staticmethod
        def _htmlentity_transform(matchobj):
                """Return the character for one matched HTML entity.

                group(1) of matchobj is the entity without '&' and ';'. Named
                entities and decimal or hexadecimal character references are
                converted; anything else is returned exactly as matched.
                """
                literal = matchobj.group(0)
                entity = matchobj.group(1)
                try:
                        if entity in htmlentitydefs.name2codepoint:
                                return unichr(htmlentitydefs.name2codepoint[entity])
                        if entity.startswith(u'#x') or entity.startswith(u'#X'):
                                return unichr(int(entity[2:], 16))
                        if entity.startswith(u'#'):
                                return unichr(int(entity[1:], 10))
                except (ValueError, OverflowError):
                        # Not a valid number, or a value unichr() does not accept
                        return literal
                return literal

        def _real_extract(self, url):
                """Return a one-element list with the information for a video.

                The element is a dictionary with the fields id, url, uploader,
                title, stitle and ext. [None] is returned, after an error
                message on stderr, when the URL is invalid or the page lacks an
                expected field. Failing to download the video webpage
                terminates the program.
                """
                # Extract video id from URL
                mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
                if mobj is None:
                        self.to_stderr('ERROR: Invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension (the format code may be given as a number or as a string)
                video_extension = {'18': 'mp4'}.get(str(format_param), 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to download video: %s' % str(err))
                self.to_stdout('[youtube] %s: Extracting video information' % video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

                # uploader
                mobj = re.search(r'More From: ([^<]*)<', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                # Match only well-formed entities so plain '&' ... ';' text is left alone
                video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._htmlentity_transform, video_title)

                # simplified title
                simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
                simple_title = simple_title.strip(u'_')

                # Return information
                return [{       'id':           video_id,
                                'url':          video_real_url,
                                'uploader':     video_uploader,
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension,
                                }]
414
# Script entry point: download a fixed list of YouTube videos.
if __name__ == '__main__':
        try:
                # General configuration
                # install_opener() replaces the global opener, so proxy and cookie
                # support have to be combined in one opener instead of being
                # installed one after the other.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

                # Information extractors
                youtube_ie = YoutubeIE()

                # File downloader
                fd = FileDownloader({   'usenetrc': False,
                                        'username': None,
                                        'password': None,
                                        'quiet': False,
                                        'format': None,
                                        'outtmpl': '%(id)s.%(ext)s'
                                        })
                fd.add_info_extractor(youtube_ie)
                fd.download([   'http://www.youtube.com/watch?v=t7qdwI7TVe8',
                                'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
                                'http://www.youtube.com/watch?v=DZRXe1wtC-M',   ])

        except KeyboardInterrupt:
                sys.exit('\nERROR: Interrupted by user')