6a79bf93a2a520ed2cf6418e42b2200918577ed9
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
19 std_headers = { 
20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23         'Accept-Language': 'en-us,en;q=0.5',
24 }
25
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
27
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. Given a video URL, the downloader doesn't know how to
        extract all the needed information (that is the task of the
        InfoExtractors), so it has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        """

        # One suffix per power of 1024, used by format_bytes().
        _SIZE_SUFFIXES = 'bkMGTPEZY'

        _params = None
        _ies = []

        def __init__(self, params):
                """Create a downloader configured by the params dictionary."""
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p".

                The last path component is taken to be the file name and is not
                created. Raises OSError if a directory cannot be created.
                """
                directory = os.path.dirname(filename)
                # An empty directory part means the file lives in the current
                # directory; absolute paths are handled by os.makedirs itself.
                if directory and not os.path.exists(directory):
                        os.makedirs(directory)

        @staticmethod
        def format_bytes(bytes):
                """Return a short human-readable size such as '1.50k', or 'N/A' for None."""
                if bytes is None:
                        return 'N/A'
                value = float(bytes)
                suffixes = FileDownloader._SIZE_SUFFIXES
                if value < 1.0:
                        # Covers zero and tiny fractional rates, whose logarithm
                        # would otherwise be undefined or negative.
                        exponent = 0
                else:
                        exponent = min(int(math.log(value, 1024.0)), len(suffixes) - 1)
                converted = value / float(1024 ** exponent)
                return '%.2f%s' % (converted, suffixes[exponent])

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the completed percentage as a 6-character string.

                A placeholder is returned when the total length is unknown or zero.
                """
                if data_len is None:
                        return '---.-%'
                total = float(data_len)
                if total <= 0:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS'.

                '--:--' is returned when the total is unknown, when there is not
                enough data to estimate a rate, or when the estimate exceeds 99
                minutes.
                """
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = int((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average transfer rate as a right-aligned 10-character string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Return the size of the next read, given how long the last one took.

                The result follows the measured rate but is kept between half and
                twice the size of the previous block, and never above 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        @staticmethod
        def _parse_content_length(value):
                """Return the Content-Length header value as an integer, or None if absent or invalid."""
                if value is None:
                        return None
                try:
                        return int(value)
                except (TypeError, ValueError):
                        return None

        def set_params(self, params):
                """Sets parameters. Raises ValueError if params is not a dictionary."""
                if not isinstance(params, dict):
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if self._params.get('quiet', False):
                        return
                if skip_eol:
                        terminator = ''
                else:
                        terminator = '\n'
                sys.stdout.write('%s%s' % (message, terminator))
                sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                sys.stderr.write('%s\n' % message)

        def fixed_template(self):
                """Checks if the output template is fixed."""
                return (re.search(u'(?u)%\\(.+?\\)s', self._params['outtmpl']) is None)

        def download(self, url_list):
                """Download a given list of URLs.

                Each URL is handed to the first registered InfoExtractor that
                reports being suitable for it. Exits the program if the output
                template is fixed but more than one file would be written.
                """
                if len(url_list) > 1 and self.fixed_template():
                        sys.exit('ERROR: fixed output name but more than one file to download')

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                # Extractors signal a failed video with None.
                                results = [x for x in (ie.extract(url) or []) if x is not None]

                                if len(results) > 1 and self.fixed_template():
                                        sys.exit('ERROR: fixed output name but more than one file to download')

                                if not self._params.get('simulate', False):
                                        for result in results:
                                                self._download_result(result)
                                # Only the first suitable InfoExtractor handles a URL,
                                # whether or not we are simulating.
                                break
                        if not suitable_found:
                                self.to_stderr('ERROR: no suitable InfoExtractor: %s' % url)

        def _download_result(self, result):
                """Download one extracted video, reporting any failure on stderr."""
                try:
                        filename = self._params['outtmpl'] % result
                except (ValueError, KeyError, TypeError) as err:
                        self.to_stderr('ERROR: invalid output template: %s' % str(err))
                        return
                try:
                        self.pmkdir(filename)
                except (OSError, IOError) as err:
                        self.to_stderr('ERROR: unable to create directories: %s' % str(err))
                        return
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError) as err:
                        self.to_stderr('ERROR: unable to open for writing: %s' % str(err))
                        return
                try:
                        try:
                                self._do_download(outstream, result['url'])
                        finally:
                                outstream.close()
                # Network errors are checked first: urllib2.URLError and
                # socket.error derive from IOError and would otherwise be
                # reported as write failures.
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self.to_stderr('ERROR: unable to download video data: %s' % str(err))
                except (OSError, IOError) as err:
                        self.to_stderr('ERROR: unable to write video data: %s' % str(err))
                except ValueError as err:
                        # Raised by _do_download() when the transfer is incomplete.
                        self.to_stderr('ERROR: unable to download video data: %s' % str(err))

        def _do_download(self, stream, url):
                """Fetch url and write its contents to stream, showing progress on stdout.

                Raises ValueError if the number of bytes received differs from the
                Content-Length announced by the server.
                """
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                try:
                        data_len = self._parse_content_length(data.info().get('Content-length', None))
                        data_len_str = self.format_bytes(data_len)
                        byte_counter = 0
                        block_size = 1024
                        start = time.time()
                        while True:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                                speed_str = self.calc_speed(start, time.time(), byte_counter)
                                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                                before = time.time()
                                data_block = data.read(block_size)
                                after = time.time()
                                data_block_len = len(data_block)
                                if data_block_len == 0:
                                        break
                                byte_counter += data_block_len
                                stream.write(data_block)
                                # Adapt the next read to the observed transfer rate.
                                block_size = self.best_block_size(after - before, data_block_len)
                finally:
                        data.close()

                self.to_stdout('')
                if data_len is not None and byte_counter != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
237
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor receives a URL and returns what is known
        about the video (or videos) behind it: the real video URL, the title,
        a simplified title, the uploader and so on. The extract() method
        returns this as a list of dictionaries, because one URL may refer to
        several videos (a playlist, for instance). Every dictionary must
        provide these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses are expected to override _real_initialize() and
        _real_extract(), together with the suitable() static method, and
        will normally be instantiated and registered with the main
        downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Build the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Tell whether this IE can handle url; the base class accepts everything."""
                return True

        def initialize(self):
                """Run the one-time setup (login, etc) unless it was already done."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return the information extracted for url."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the downloader this IE reports to."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message unless the attached downloader is in quiet mode."""
                quiet = False
                if self._downloader is not None:
                        quiet = self._downloader.get_params().get('quiet', False)
                if not quiet:
                        print(message)

        def to_stderr(self, message):
                """Write message, followed by a newline, to stderr."""
                sys.stderr.write('%s\n' % message)

        def _real_initialize(self):
                """Hook for the actual setup work; subclasses override it."""
                return None

        def _real_extract(self, url):
                """Hook for the actual extraction work; subclasses override it."""
                return None
304
305 class YoutubeIE(InfoExtractor):
306         """Information extractor for youtube.com."""
307
308         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
309         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
310         _NETRC_MACHINE = 'youtube'
311
312         def _real_initialize(self):
313                 if self._downloader is None:
314                         return
315
316                 username = None
317                 password = None
318                 downloader_params = self._downloader.get_params()
319
320                 # Attempt to use provided username and password or .netrc data
321                 if downloader_params.get('username', None) is not None:
322                         username = downloader_params['username']
323                         password = downloader_params['password']
324                 elif downloader_params.get('usenetrc', False):
325                         try:
326                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
327                                 if info is not None:
328                                         username = info[0]
329                                         password = info[2]
330                                 else:
331                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
332                         except (IOError, netrc.NetrcParseError), err:
333                                 self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
334                                 return
335
336                 if username is None:
337                         return
338
339                 # Log in
340                 login_form = {
341                                 'current_form': 'loginForm',
342                                 'next':         '/',
343                                 'action_login': 'Log In',
344                                 'username':     username,
345                                 'password':     password,
346                                 }
347                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
348                 try:
349                         self.to_stdout('[youtube] Logging in')
350                         login_results = urllib2.urlopen(request).read()
351                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
352                                 self.to_stderr('WARNING: Unable to log in: bad username or password')
353                                 return
354                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
355                         self.to_stderr('WARNING: Unable to log in: %s' % str(err))
356                         return
357         
358                 # Confirm age
359                 age_form = {
360                                 'next_url':             '/',
361                                 'action_confirm':       'Confirm',
362                                 }
363                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
364                 try:
365                         self.to_stdout('[youtube] Confirming age')
366                         age_results = urllib2.urlopen(request).read()
367                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
368                         sys.exit('ERROR: Unable to confirm age: %s' % str(err))
369
370         def _real_extract(self, url):
371                 # Extract video id from URL
372                 mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
373                 if mobj is None:
374                         self.to_stderr('ERROR: Invalid URL: %s' % url)
375                         return [None]
376                 video_id = mobj.group(2)
377
378                 # Downloader parameters
379                 format_param = None
380                 if self._downloader is not None:
381                         params = self._downloader.get_params()
382                         format_param = params.get('format', None)
383
384                 # Extension
385                 video_extension = {18: 'mp4'}.get(format_param, 'flv')
386
387                 # Normalize URL, including format
388                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
389                 if format_param is not None:
390                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
391                 request = urllib2.Request(normalized_url, None, std_headers)
392                 try:
393                         self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
394                         video_webpage = urllib2.urlopen(request).read()
395                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
396                         sys.exit('ERROR: Unable to download video: %s' % str(err))
397                 self.to_stdout('[youtube] %s: Extracting video information' % video_id)
398                 
399                 # "t" param
400                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
401                 if mobj is None:
402                         self.to_stderr('ERROR: Unable to extract "t" parameter')
403                         return [None]
404                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
405                 if format_param is not None:
406                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
407                 self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))
408
409                 # uploader
410                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
411                 if mobj is None:
412                         self.to_stderr('ERROR: Unable to extract uploader nickname')
413                         return [None]
414                 video_uploader = mobj.group(1)
415
416                 # title
417                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
418                 if mobj is None:
419                         self.to_stderr('ERROR: Unable to extract video title')
420                         return [None]
421                 video_title = mobj.group(1).decode('utf-8')
422                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
423
424                 # simplified title
425                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
426                 simple_title = simple_title.strip(ur'_')
427
428                 # Return information
429                 return [{
430                         'id':           video_id,
431                         'url':          video_real_url,
432                         'uploader':     video_uploader,
433                         'title':        video_title,
434                         'stitle':       simple_title,
435                         'ext':          video_extension,
436                         }]
437
if __name__ == '__main__':
        try:
                # General configuration: install a single opener that both honours
                # proxy settings and keeps cookies between requests. Installing
                # two openers in a row would simply discard the first one.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

                # Information extractors
                youtube_ie = YoutubeIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': False,
                        'username': None,
                        'password': None,
                        'quiet': False,
                        'simulate': False,
                        'format': None,
                        'outtmpl': '%(ext)s/%(ext)s/%(id)s.%(ext)s'
                        })
                fd.add_info_extractor(youtube_ie)
                fd.download([
                        'http://www.youtube.com/watch?v=t7qdwI7TVe8',
                        'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
                        'http://www.youtube.com/watch?v=DZRXe1wtC-M',
                        ])

        except KeyboardInterrupt:
                # Exit with a short message instead of a traceback on Ctrl-C.
                sys.exit('\nERROR: Interrupted by user')