]> git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl
7323ad4f28981c4bd7aae3930215363e5af16270
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
# Headers attached to every urllib2 request built in this file; the values
# mimic a Firefox 3.0 browser.
std_headers = {
        # The value must not repeat the header name: urllib2 adds "User-Agent: " itself.
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9) Gecko/2008052906 Firefox/3.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}
25
# Characters allowed in a simplified title: the ASCII letters followed by the
# decimal digits, held as one unicode string (the unicode separator makes
# join() return unicode).
simple_title_chars = u''.join((string.ascii_letters, string.digits))
27
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible for downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, a task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        """

        _params = None
        _ies = []

        def __init__(self, params):
                """Create a downloader configured by the params dictionary."""
                # Rebind per instance so registered extractors are not shared
                # through the class-level list.
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p".

                The last component is taken to be the file itself and is not created.
                """
                components = filename.split(os.sep)
                for count in range(1, len(components)):
                        directory = os.sep.join(components[:count])
                        # An absolute path starts with an empty component; there is
                        # nothing to create for it and os.mkdir('') would fail.
                        if directory and not os.path.exists(directory):
                                os.mkdir(directory)

        @staticmethod
        def format_bytes(bytes):
                """Return a byte count as a short string such as '1.50k'.

                bytes may be None (unknown size, gives 'N/A'), a number, or a
                numeric string. Raises ValueError for a non-numeric string.
                """
                if bytes is None:
                        return 'N/A'
                suffixes = 'bkMGTPEZY'
                amount = float(bytes)
                if amount < 1.0:
                        # log() is undefined at 0 and negative below 1; keep the base unit.
                        exponent = 0
                else:
                        # Clamp so that absurdly large values still map to a valid suffix.
                        exponent = min(int(math.log(amount, 1024.0)), len(suffixes) - 1)
                converted = amount / float(1024 ** exponent)
                return '%.2f%s' % (converted, suffixes[exponent])

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the completed percentage as a 6-character string.

                Gives the placeholder '---.-%' when the total is unknown (None)
                or zero, since no percentage can be computed then.
                """
                if data_len is None:
                        return '---.-%'
                total = float(data_len)
                if total == 0:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Return the estimated remaining time as 'MM:SS'.

                Gives '--:--' when the total is unknown, when nothing has been
                transferred yet, when less than a millisecond has elapsed, or
                when the estimate exceeds 99 minutes.
                """
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = int((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average transfer speed as a right-aligned 10-character string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Return the size to request for the next read.

                The result follows the rate of the last read (bytes per second)
                but stays between half and double the last block, and never
                exceeds 4 MB.
                """
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        def set_params(self, params):
                """Sets parameters. Raises ValueError unless params is a dictionary."""
                if not isinstance(params, dict):
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if self._params.get('quiet', False):
                        return
                if skip_eol:
                        terminator = ''
                else:
                        terminator = '\n'
                sys.stdout.write('%s%s' % (message, terminator))
                sys.stdout.flush()

        def download(self, url_list):
                """Download a given list of URLs.

                Each URL is handled by the first registered InfoExtractor that
                reports it as suitable. Per-file failures are reported on
                stderr and the remaining files are still processed. Exits the
                program if several files would be written to a fixed file name.
                """
                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                results = [x for x in ie.extract(url) if x is not None]

                                if (len(url_list) > 1 or len(results) > 1) and re.search(r'%\(.+?\)s', self._params['outtmpl']) is None:
                                        sys.exit('ERROR: fixed output name but more than one file to download')

                                if not self._params.get('simulate', False):
                                        for result in results:
                                                self._download_result(result)
                                # Only the first suitable extractor handles a URL,
                                # whether simulating or not.
                                break
                        if not suitable_found:
                                sys.stderr.write('ERROR: no suitable InfoExtractor: %s\n' % url)

        def _download_result(self, result):
                """Write one extracted video to disk, reporting any failure on stderr."""
                try:
                        filename = self._params['outtmpl'] % result
                except KeyError as err:
                        sys.stderr.write('ERROR: invalid output template: %s\n' % str(err))
                        return
                try:
                        self.pmkdir(filename)
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to create directories: %s\n' % str(err))
                        return
                try:
                        outstream = open(filename, 'wb')
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to open for writing: %s\n' % str(err))
                        return
                try:
                        try:
                                self._do_download(outstream, result['url'])
                        finally:
                                # Close the file even when the transfer fails.
                                outstream.close()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        # urllib2.URLError and socket.error derive from IOError, so the
                        # network handler has to come before the generic I/O one.
                        sys.stderr.write('ERROR: unable to download video data: %s\n' % str(err))
                except (OSError, IOError) as err:
                        sys.stderr.write('ERROR: unable to write video data: %s\n' % str(err))

        def _do_download(self, stream, url):
                """Fetch url and write its body to stream, printing progress.

                Raises ValueError if the Content-length header is not a number
                or if the amount of data received differs from it. Network and
                file errors propagate to the caller.
                """
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                try:
                        data_len = data.info().get('Content-length', None)
                        if data_len is not None:
                                # The header value is a string; convert it once so the
                                # progress figures and the final check work on a number.
                                data_len = int(data_len)
                        data_len_str = self.format_bytes(data_len)
                        byte_counter = 0
                        block_size = 1024
                        start = time.time()
                        while True:
                                percent_str = self.calc_percent(byte_counter, data_len)
                                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                                speed_str = self.calc_speed(start, time.time(), byte_counter)
                                self.to_stdout('\r[download] %s of %s at %s ETA %s' %
                                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

                                before = time.time()
                                data_block = data.read(block_size)
                                after = time.time()
                                data_block_len = len(data_block)
                                if data_block_len == 0:
                                        break
                                byte_counter += data_block_len
                                stream.write(data_block)
                                # Adapt the next read to the speed just observed.
                                block_size = self.best_block_size(after - before, data_block_len)
                finally:
                        data.close()

                # Terminate the progress line, which was written without a newline.
                self.to_stdout('')
                if data_len is not None and byte_counter != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
226
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor receives a URL and works out the data
        needed to fetch the video (or videos) it refers to: the real video
        URL, the title and its simplified form, the uploader and so on. The
        extract() method hands this back as a list of dictionaries, one per
        video, because a single URL may stand for several videos (a playlist,
        for instance). Every dictionary must carry these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses are expected to override _real_initialize() and
        _real_extract(), plus the suitable() static method, and will normally
        be instantiated and registered with the main downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally bound to a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Tell whether this extractor can handle url (always True here)."""
                return True

        def initialize(self):
                """Run the one-time setup (login, etc) unless it already ran."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then return the extracted list of dicts for url."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Remember the downloader this extractor works for."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message unless the attached downloader is in quiet mode."""
                downloader = self._downloader
                if downloader is not None and downloader.get_params().get('quiet', False):
                        return
                print(message)

        def to_stderr(self, message):
                """Write message and a newline to stderr."""
                line = '%s\n' % message
                sys.stderr.write(line)

        def _real_initialize(self):
                """Actual setup work. Meant to be overridden; does nothing here."""

        def _real_extract(self, url):
                """Actual extraction work. Meant to be overridden; returns None here."""
293
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        _LOGIN_URL = 'http://www.youtube.com/login?next=/'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
        _NETRC_MACHINE = 'youtube'

        @staticmethod
        def _decode_html_entity(match):
                """Return the character for one '&...;' match.

                Handles named entities and decimal/hexadecimal character
                references. Anything unrecognised is returned unchanged
                rather than raising.
                """
                entity = match.group(1)
                if entity in htmlentitydefs.name2codepoint:
                        return unichr(htmlentitydefs.name2codepoint[entity])
                numeric = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
                if numeric is not None:
                        try:
                                if numeric.group(1) is not None:
                                        return unichr(int(numeric.group(1), 16))
                                return unichr(int(numeric.group(2)))
                        except (ValueError, OverflowError):
                                # Code point outside the range unichr() accepts:
                                # fall through and keep the original text.
                                pass
                return match.group(0)

        def _real_initialize(self):
                """Log in and confirm age when credentials are available.

                Credentials come from the downloader's 'username'/'password'
                parameters or, if 'usenetrc' is set, from the .netrc entry for
                the 'youtube' machine. Login problems only produce a warning;
                a network failure while confirming age exits the program.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        # A missing password must not abort with KeyError.
                        password = downloader_params.get('password', None)
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError) as err:
                                self.to_stderr('WARNING: parsing .netrc: %s' % str(err))
                                return

                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.to_stdout('[youtube] Logging in')
                        login_results = urllib2.urlopen(request).read()
                        # Getting the login form back means the credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr('WARNING: Unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        self.to_stderr('WARNING: Unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.to_stdout('[youtube] Confirming age')
                        # The response body is read but its content is not needed.
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to confirm age: %s' % str(err))

        def _real_extract(self, url):
                """Return a one-element list describing the video behind url.

                The element is a dictionary with the id, url, uploader, title,
                stitle and ext fields, or None when the URL is invalid or a
                field cannot be found in the video web page. Exits the program
                if the web page cannot be downloaded.
                """
                # Extract video id from URL
                mobj = re.match(r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$', url)
                if mobj is None:
                        self.to_stderr('ERROR: Invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension
                video_extension = {18: 'mp4'}.get(format_param, 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.to_stdout('[youtube] %s: Downloading video webpage' % video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                        sys.exit('ERROR: Unable to download video: %s' % str(err))
                self.to_stdout('[youtube] %s: Extracting video information' % video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.to_stdout('[youtube] %s: URL: %s' % (video_id, video_real_url))

                # uploader
                mobj = re.search(r'More From: ([^<]*)<', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr('ERROR: Unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                # Match only well-formed entities, so a bare "&" followed later
                # by ";" is left alone and cannot swallow a real entity.
                video_title = re.sub(u'&(#?[0-9A-Za-z]+);', self._decode_html_entity, video_title)

                # simplified title
                simple_title = re.sub(u'([^%s]+)' % simple_title_chars, u'_', video_title)
                simple_title = simple_title.strip(u'_')

                # Return information
                return [{
                        'id':           video_id,
                        'url':          video_real_url,
                        'uploader':     video_uploader,
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension,
                        }]
426
if __name__ == '__main__':
        try:
                # General configuration: install one opener that carries both
                # the proxy handler and the cookie processor. Installing them
                # in two separate calls would let the second replace the first.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))

                # Information extractors
                youtube_ie = YoutubeIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': False,
                        'username': None,
                        'password': None,
                        'quiet': False,
                        # With simulate set, information is extracted but no file is written.
                        'simulate': True,
                        'format': None,
                        'outtmpl': '%(id)s.%(ext)s'
                        })
                fd.add_info_extractor(youtube_ie)
                fd.download([
                        'http://www.youtube.com/watch?v=t7qdwI7TVe8',
                        'http://www.youtube.com/watch?v=IJyn3pRcy_Q',
                        'http://www.youtube.com/watch?v=DZRXe1wtC-M',
                        ])

        except KeyboardInterrupt:
                sys.exit('\nERROR: Interrupted by user')