]> git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl
3e7f13956d534fd76478a68594f31f8e552e62ed
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import locale
8 import math
9 import netrc
10 import os
11 import os.path
12 import re
13 import socket
14 import string
15 import sys
16 import time
17 import urllib
18 import urllib2
19
# HTTP headers sent with every request. The User-Agent mimics a real
# Firefox build so sites serve the same pages a browser would receive.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters kept verbatim in "simplified" titles: ASCII letters and digits,
# decoded to unicode (Python 2 str.decode) so they can be interpolated into
# the (?u) unicode regexes used for title simplification.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
28
class DownloadError(Exception):
        """Signals a fatal download problem.

        Raised by FileDownloader objects when an error occurs and the
        downloader has not been configured to continue on errors; carries
        the appropriate error message.
        """
37
class SameFileError(Exception):
        """Signals an output-filename collision.

        Raised by FileDownloader objects when they detect that several
        downloads would be written to the same file on disk.
        """
45
class PostProcessingError(Exception):
        """Signals a failure during postprocessing.

        May be raised by a PostProcessor's .run() method to indicate that
        the postprocessing task failed.
        """
53
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge for the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        """

        # Class-level defaults; __init__ rebinds _ies/_pps to fresh lists so
        # instances do not share these mutable class attributes.
        _params = None
        _ies = []
        _pps = []

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # Build every ancestor path: 'a/b/c/f' -> ['a', 'a/b', 'a/b/c']
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Format a byte count as a human-readable string, e.g. '1.21M'."""
                if bytes is None:
                        return 'N/A'
                if bytes == 0:
                        exponent = 0
                else:
                        # log base 1024 selects the unit suffix below.
                        # NOTE(review): callers may pass the Content-length header
                        # value as a *string*; float() accepts it, but the string
                        # "0" would skip the == 0 branch above -- confirm callers.
                        exponent = long(math.log(float(bytes), 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return progress as a fixed-width percentage string ('---.-%' if unknown)."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate remaining time as 'MM:SS'; '--:--' when unknown or >99 min."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed as a right-aligned string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Choose the next read size: follow the measured rate, clamped
                to [bytes/2, max(bytes*2, 1)] and capped at 4 MB."""
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An empty suffix maps to index 0 ('b'), i.e. multiplier 1.
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def set_params(self, params):
                """Sets parameters; raises ValueError unless given a dict."""
                if type(params) != dict:
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self._params.get('quiet', False):
                        # Trailing comma suppresses print's newline; the list
                        # indexing picks u'\n' or u'' depending on skip_eol.
                        print u'%s%s' % (message, [u'\n', u''][skip_eol]),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def fixed_template(self):
                """Checks if the output template is fixed (contains no %(...)s substitutions)."""
                return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message. If it
                doesn't raise, it returns an error code suitable to be returned
                later as a program exit code to indicate error.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self._params.get('ignoreerrors', False):
                        raise DownloadError(message)
                return 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self._params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep just long enough for the average speed to fall
                        # back to the configured limit.
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                # Leading '\r' rewrites the same terminal line on each update.
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_finish(self):
                """Report download finished (terminates the progress line)."""
                self.to_stdout(u'')

        def download(self, url_list):
                """Download a given list of URLs.

                Returns 0 on success or the last error code produced by
                trouble(); may raise SameFileError or DownloadError.
                """
                retcode = 0
                # A fixed template with several URLs would overwrite one file.
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self._params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                all_results = ie.extract(url)
                                # None entries mark videos the extractor failed on.
                                results = [x for x in all_results if x is not None]
                                if len(results) != len(all_results):
                                        retcode = self.trouble()

                                if len(results) > 1 and self.fixed_template():
                                        raise SameFileError(self._params['outtmpl'])

                                for result in results:
                                        # Forced printings
                                        if self._params.get('forcetitle', False):
                                                print result['title']
                                        if self._params.get('forceurl', False):
                                                print result['url']

                                        # Do nothing else if in simulate mode
                                        if self._params.get('simulate', False):
                                                continue

                                        try:
                                                filename = self._params['outtmpl'] % result
                                                self.report_destination(filename)
                                        except (ValueError, KeyError), err:
                                                retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                                                continue
                                        try:
                                                self.pmkdir(filename)
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
                                                continue
                                        try:
                                                outstream = open(filename, 'wb')
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                                continue
                                        try:
                                                self._do_download(outstream, result['url'])
                                                outstream.close()
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
                                                continue
                                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                                retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
                                                continue
                                        try:
                                                self.post_process(filename, result)
                                        except (PostProcessingError), err:
                                                retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
                                                continue

                                # Only the first suitable extractor handles the URL.
                                break
                        if not suitable_found:
                                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain.
                        if info is None:
                                break

        def _do_download(self, stream, url):
                """Fetch url and write its body to stream, reporting progress and
                honouring the rate limit; raises ValueError on a short read."""
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                # NOTE(review): data_len stays the raw header *string* (or None);
                # it is converted where needed and compared at the end via
                # str(byte_counter).
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len
                        stream.write(data_block)
                        # Adapt the next read size to the measured throughput.
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                if data_len is not None and str(byte_counter) != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
360
361 class InfoExtractor(object):
362         """Information Extractor class.
363
364         Information extractors are the classes that, given a URL, extract
365         information from the video (or videos) the URL refers to. This
366         information includes the real video URL, the video title and simplified
367         title, author and others. It is returned in a list of dictionaries when
368         calling its extract() method. It is a list because a URL can refer to
369         more than one video (think of playlists). The dictionaries must include
370         the following fields:
371
372         id:             Video identifier.
373         url:            Final video URL.
374         uploader:       Nickname of the video uploader.
375         title:          Literal title.
376         stitle:         Simplified title.
377         ext:            Video filename extension.
378
379         Subclasses of this one should re-define the _real_initialize() and
380         _real_extract() methods, as well as the suitable() static method.
381         Probably, they should also be instantiated and added to the main
382         downloader.
383         """
384
385         _ready = False
386         _downloader = None
387
388         def __init__(self, downloader=None):
389                 """Constructor. Receives an optional downloader."""
390                 self._ready = False
391                 self.set_downloader(downloader)
392
393         @staticmethod
394         def suitable(url):
395                 """Receives a URL and returns True if suitable for this IE."""
396                 return False
397
398         def initialize(self):
399                 """Initializes an instance (authentication, etc)."""
400                 if not self._ready:
401                         self._real_initialize()
402                         self._ready = True
403
404         def extract(self, url):
405                 """Extracts URL information and returns it in list of dicts."""
406                 self.initialize()
407                 return self._real_extract(url)
408
409         def set_downloader(self, downloader):
410                 """Sets the downloader for this IE."""
411                 self._downloader = downloader
412         
413         def to_stdout(self, message):
414                 """Print message to stdout if downloader is not in quiet mode."""
415                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
416                         print message
417         
418         def to_stderr(self, message):
419                 """Print message to stderr."""
420                 print >>sys.stderr, message
421
422         def _real_initialize(self):
423                 """Real initialization process. Redefine in subclasses."""
424                 pass
425
426         def _real_extract(self, url):
427                 """Real extraction process. Redefine in subclasses."""
428                 pass
429
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # Group 1 captures the optional URL prefix; group 2 is the video id.
        # The (?(1).+)? conditional allows trailing text only when a prefix
        # matched, so a bare video id string is accepted too.
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in the user's .netrc for credentials.
        _NETRC_MACHINE = 'youtube'

        @staticmethod
        def suitable(url):
                """Return True if url matches a youtube.com video URL."""
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        def report_lang(self):
                """Report attempt to set language."""
                self.to_stdout(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self.to_stdout(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[youtube] Confirming age')

        def report_webpage_download(self, video_id):
                """Report attempt to download webpage."""
                self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

        def report_video_url(self, video_id, video_real_url):
                """Report extracted video URL."""
                self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

        def _real_initialize(self):
                """Set language and, if credentials are available, log in and
                confirm age.

                Credentials come from the downloader params or, failing that,
                from the user's .netrc. Failures are reported as warnings and
                abort initialization without raising.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # No authentication to be performed
                if username is None:
                        return

                # Set language
                # NOTE(review): this requests _LOGIN_URL while _LANG_URL is
                # defined above and never used -- looks unintentional; confirm
                # against upstream history before changing.
                request = urllib2.Request(self._LOGIN_URL, None, std_headers)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # The login form reappearing in the response means the
                        # credentials were rejected.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Extract info for a YouTube URL.

                Returns a one-element list with the info dictionary, or [None]
                on any failure (invalid URL, network error, page layout change).
                """
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension: fmt 18 is MP4, fmt 17 is 3GP; default is FLV.
                video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.report_webpage_download(video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
                        return [None]
                self.report_information_extraction(video_id)

                # "t" param: per-request token required by /get_video
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.report_video_url(video_id, video_real_url)

                # uploader
                mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                # Expand named HTML entities, then replace path separators so
                # the title is safe to use in a filename.
                video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
                video_title = video_title.replace(os.sep, u'%')

                # simplified title: collapse runs of other characters to '_'
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # Return information
                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_real_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension.decode('utf-8'),
                        }]
605
606 class MetacafeIE(InfoExtractor):
607         """Information Extractor for metacafe.com."""
608
609         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
610         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
611         _youtube_ie = None
612
        def __init__(self, youtube_ie, downloader=None):
                """Store the YoutubeIE used to delegate 'yt-' prefixed videos."""
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie
616
        @staticmethod
        def suitable(url):
                """Return True if url matches a metacafe.com watch URL."""
                return (re.match(MetacafeIE._VALID_URL, url) is not None)
620
        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self.to_stdout(u'[metacafe] Retrieving disclaimer')
624
        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[metacafe] Confirming age')
628         
        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
632         
        def report_extraction(self, video_id):
                """Report information extraction."""
                self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
636
        def _real_initialize(self):
                """Fetch the family-filter disclaimer page and post the age
                confirmation so later video requests are not blocked.

                Failures are reported to stderr and abort initialization
                without raising.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return
659         
660         def _real_extract(self, url):
661                 # Extract id and simplified title from URL
662                 mobj = re.match(self._VALID_URL, url)
663                 if mobj is None:
664                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
665                         return [None]
666
667                 video_id = mobj.group(1)
668
669                 # Check if video comes from YouTube
670                 mobj2 = re.match(r'^yt-(.*)$', video_id)
671                 if mobj2 is not None:
672                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
673
674                 simple_title = mobj.group(2).decode('utf-8')
675                 video_extension = 'flv'
676
677                 # Retrieve video webpage to extract further information
678                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
679                 try:
680                         self.report_download_webpage(video_id)
681                         webpage = urllib2.urlopen(request).read()
682                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
683                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
684                         return [None]
685
686                 # Extract URL, uploader and title from webpage
687                 self.report_extraction(video_id)
688                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
689                 if mobj is None:
690                         self.to_stderr(u'ERROR: unable to extract media URL')
691                         return [None]
692                 mediaURL = mobj.group(1).replace('\\', '')
693
694                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
695                 if mobj is None:
696                         self.to_stderr(u'ERROR: unable to extract gdaKey')
697                         return [None]
698                 gdaKey = mobj.group(1)
699
700                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
701
702                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
703                 if mobj is None:
704                         self.to_stderr(u'ERROR: unable to extract title')
705                         return [None]
706                 video_title = mobj.group(1).decode('utf-8')
707
708                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
709                 if mobj is None:
710                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
711                         return [None]
712                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
713
714                 # Return information
715                 return [{
716                         'id':           video_id.decode('utf-8'),
717                         'url':          video_url.decode('utf-8'),
718                         'uploader':     video_uploader.decode('utf-8'),
719                         'title':        video_title,
720                         'stitle':       simple_title,
721                         'ext':          video_extension.decode('utf-8'),
722                         }]
723
724
725 class YoutubeSearchIE(InfoExtractor):
726         """Information Extractor for YouTube search queries."""
727         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
728         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
729         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
730         _MORE_PAGES_INDICATOR = r'>Next</a>'
731         _youtube_ie = None
732
733         def __init__(self, youtube_ie, downloader=None): 
734                 InfoExtractor.__init__(self, downloader)
735                 self._youtube_ie = youtube_ie
736         
737         @staticmethod
738         def suitable(url):
739                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
740
741         def report_download_page(self, query, pagenum):
742                 """Report attempt to download playlist page with given number."""
743                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
744
745         def _real_initialize(self):
746                 self._youtube_ie.initialize()
747         
748         def _real_extract(self, query):
749                 mobj = re.match(self._VALID_QUERY, query)
750                 if mobj is None:
751                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
752                         return [None]
753
754                 prefix, query = query.split(':')
755                 prefix = prefix[8:]
756                 if prefix == '': 
757                         return self._download_n_results(query, 1)
758                 elif prefix == 'all': 
759                         return self._download_n_results(query, -1)
760                 else: 
761                         try:
762                                 n = int(prefix)
763                                 if n <= 0:
764                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
765                                         return [None]
766                                 return self._download_n_results(query, n)
767                         except ValueError: # parsing prefix as int fails
768                                 return self._download_n_results(query, 1)
769
770         def _download_n_results(self, query, n):
771                 """Downloads a specified number of results for a query"""
772
773                 video_ids = []
774                 already_seen = set()
775                 pagenum = 1
776
777                 while True:
778                         self.report_download_page(query, pagenum)
779                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
780                         request = urllib2.Request(result_url, None, std_headers)
781                         try:
782                                 page = urllib2.urlopen(request).read()
783                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
784                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
785                                 return [None]
786
787                         # Extract video identifiers
788                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
789                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
790                                 if video_id not in already_seen:
791                                         video_ids.append(video_id)
792                                         already_seen.add(video_id)
793                                         if len(video_ids) == n:
794                                                 # Specified n videos reached
795                                                 information = []
796                                                 for id in video_ids:
797                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
798                                                 return information
799
800                         if self._MORE_PAGES_INDICATOR not in page:
801                                 information = []
802                                 for id in video_ids:
803                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
804                                 return information
805
806                         pagenum = pagenum + 1
807
808 class YoutubePlaylistIE(InfoExtractor):
809         """Information Extractor for YouTube playlists."""
810
811         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
812         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
813         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
814         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
815         _youtube_ie = None
816
817         def __init__(self, youtube_ie, downloader=None):
818                 InfoExtractor.__init__(self, downloader)
819                 self._youtube_ie = youtube_ie
820         
821         @staticmethod
822         def suitable(url):
823                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
824
825         def report_download_page(self, playlist_id, pagenum):
826                 """Report attempt to download playlist page with given number."""
827                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
828
829         def _real_initialize(self):
830                 self._youtube_ie.initialize()
831         
832         def _real_extract(self, url):
833                 # Extract playlist id
834                 mobj = re.match(self._VALID_URL, url)
835                 if mobj is None:
836                         self.to_stderr(u'ERROR: invalid url: %s' % url)
837                         return [None]
838
839                 # Download playlist pages
840                 playlist_id = mobj.group(1)
841                 video_ids = []
842                 pagenum = 1
843
844                 while True:
845                         self.report_download_page(playlist_id, pagenum)
846                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
847                         try:
848                                 page = urllib2.urlopen(request).read()
849                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
850                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
851                                 return [None]
852
853                         # Extract video identifiers
854                         ids_in_page = []
855                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
856                                 if mobj.group(1) not in ids_in_page:
857                                         ids_in_page.append(mobj.group(1))
858                         video_ids.extend(ids_in_page)
859
860                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
861                                 break
862                         pagenum = pagenum + 1
863
864                 information = []
865                 for id in video_ids:
866                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
867                 return information
868
869 class PostProcessor(object):
870         """Post Processor class.
871
872         PostProcessor objects can be added to downloaders with their
873         add_post_processor() method. When the downloader has finished a
874         successful download, it will take its internal chain of PostProcessors
875         and start calling the run() method on each one of them, first with
876         an initial argument and then with the returned value of the previous
877         PostProcessor.
878
879         The chain will be stopped if one of them ever returns None or the end
880         of the chain is reached.
881
882         PostProcessor objects follow a "mutual registration" process similar
883         to InfoExtractor objects.
884         """
885
886         _downloader = None
887
888         def __init__(self, downloader=None):
889                 self._downloader = downloader
890
891         def to_stdout(self, message):
892                 """Print message to stdout if downloader is not in quiet mode."""
893                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
894                         print message
895         
896         def to_stderr(self, message):
897                 """Print message to stderr."""
898                 print >>sys.stderr, message
899
900         def set_downloader(self, downloader):
901                 """Sets the downloader for this PP."""
902                 self._downloader = downloader
903         
904         def run(self, information):
905                 """Run the PostProcessor.
906
907                 The "information" argument is a dictionary like the ones
908                 returned by InfoExtractors. The only difference is that this
909                 one has an extra field called "filepath" that points to the
910                 downloaded file.
911
912                 When this method returns None, the postprocessing chain is
913                 stopped. However, this method may return an information
914                 dictionary that will be passed to the next postprocessing
915                 object in the chain. It can be the one it received after
916                 changing some fields.
917
918                 In addition, this method may raise a PostProcessingError
919                 exception that will be taken into account by the downloader
920                 it was called from.
921                 """
922                 return information # by default, do nothing
923         
924 ### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# General configuration
		# NOTE(review): install_opener replaces the global opener, so only
		# the second opener is in effect; build_opener still equips it with
		# the default handlers (proxy support included) -- confirm intended.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
				usage='Usage: %prog [options] url...',
				version='2009.01.31',
				conflict_handler='resolve',
				)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		# -b and -m are shorthands that preset the --format value
		parser.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='alias for -f 18', const='18')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		(opts, args) = parser.parse_args()

		# Batch file verification: URLs from -a are prepended to positional args
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader
		# Locale charset is used to decode the user-supplied -o template
		charset = locale.getdefaultlocale()[1]
		if charset is None:
			charset = 'ascii'
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			})
		# Most specific extractors first so the generic YouTube IE does
		# not claim search/playlist URLs
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')