]> git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl
e20e59bf8c3fda8d320803c83794d5697cbb24f0
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
30 class DownloadError(Exception):
31         """Download Error exception.
32         
33         This exception may be thrown by FileDownloader objects if they are not
34         configured to continue on errors. They will contain the appropriate
35         error message.
36         """
37         pass
38
39 class SameFileError(Exception):
40         """Same File exception.
41
42         This exception will be thrown by FileDownloader objects if they detect
43         multiple files would have to be downloaded to the same file on disk.
44         """
45         pass
46
47 class PostProcessingError(Exception):
48         """Post Processing exception.
49
50         This exception may be raised by PostProcessor's .run() method to
51         indicate an error in the postprocessing task.
52         """
53         pass
54
55 class FileDownloader(object):
56         """File Downloader class.
57
58         File downloader objects are the ones responsible of downloading the
59         actual video file and writing it to disk if the user has requested
60         it, among some other tasks. In most cases there should be one per
61         program. As, given a video URL, the downloader doesn't know how to
62         extract all the needed information, task that InfoExtractors do, it
63         has to pass the URL to one of them.
64
65         For this, file downloader objects have a method that allows
66         InfoExtractors to be registered in a given order. When it is passed
67         a URL, the file downloader handles it to the first InfoExtractor it
68         finds that reports being able to handle it. The InfoExtractor extracts
69         all the information about the video or videos the URL refers to, and
70         asks the FileDownloader to process the video information, possibly
71         downloading the video.
72
73         File downloaders accept a lot of parameters. In order not to saturate
74         the object constructor with arguments, it receives a dictionary of
75         options instead. These options are available through the params
76         attribute for the InfoExtractors to use. The FileDownloader also
77         registers itself as the downloader in charge for the InfoExtractors
78         that are added to it, so this is a "mutual registration".
79
80         Available options:
81
82         username:       Username for authentication purposes.
83         password:       Password for authentication purposes.
84         usenetrc:       Use netrc for authentication instead.
85         quiet:          Do not print messages to stdout.
86         forceurl:       Force printing final URL.
87         forcetitle:     Force printing title.
88         simulate:       Do not download the video files.
89         format:         Video format code.
90         outtmpl:        Template for output names.
91         ignoreerrors:   Do not stop on download errors.
92         ratelimit:      Download speed limit, in bytes/sec.
93         nooverwrites:   Prevent overwriting files.
94         """
95
96         params = None
97         _ies = []
98         _pps = []
99         _download_retcode = None
100
101         def __init__(self, params):
102                 """Create a FileDownloader object with the given options."""
103                 self._ies = []
104                 self._pps = []
105                 self._download_retcode = 0
106                 self.params = params
107         
108         @staticmethod
109         def pmkdir(filename):
110                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
111                 components = filename.split(os.sep)
112                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
113                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
114                 for dir in aggregate:
115                         if not os.path.exists(dir):
116                                 os.mkdir(dir)
117         
118         @staticmethod
119         def format_bytes(bytes):
120                 if bytes is None:
121                         return 'N/A'
122                 if bytes == 0:
123                         exponent = 0
124                 else:
125                         exponent = long(math.log(float(bytes), 1024.0))
126                 suffix = 'bkMGTPEZY'[exponent]
127                 converted = float(bytes) / float(1024**exponent)
128                 return '%.2f%s' % (converted, suffix)
129
130         @staticmethod
131         def calc_percent(byte_counter, data_len):
132                 if data_len is None:
133                         return '---.-%'
134                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
135
136         @staticmethod
137         def calc_eta(start, now, total, current):
138                 if total is None:
139                         return '--:--'
140                 dif = now - start
141                 if current == 0 or dif < 0.001: # One millisecond
142                         return '--:--'
143                 rate = float(current) / dif
144                 eta = long((float(total) - float(current)) / rate)
145                 (eta_mins, eta_secs) = divmod(eta, 60)
146                 if eta_mins > 99:
147                         return '--:--'
148                 return '%02d:%02d' % (eta_mins, eta_secs)
149
150         @staticmethod
151         def calc_speed(start, now, bytes):
152                 dif = now - start
153                 if bytes == 0 or dif < 0.001: # One millisecond
154                         return '%10s' % '---b/s'
155                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
156
157         @staticmethod
158         def best_block_size(elapsed_time, bytes):
159                 new_min = max(bytes / 2.0, 1.0)
160                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
161                 if elapsed_time < 0.001:
162                         return int(new_max)
163                 rate = bytes / elapsed_time
164                 if rate > new_max:
165                         return int(new_max)
166                 if rate < new_min:
167                         return int(new_min)
168                 return int(rate)
169
170         @staticmethod
171         def parse_bytes(bytestr):
172                 """Parse a string indicating a byte quantity into a long integer."""
173                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
174                 if matchobj is None:
175                         return None
176                 number = float(matchobj.group(1))
177                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
178                 return long(round(number * multiplier))
179
180         def add_info_extractor(self, ie):
181                 """Add an InfoExtractor object to the end of the list."""
182                 self._ies.append(ie)
183                 ie.set_downloader(self)
184         
185         def add_post_processor(self, pp):
186                 """Add a PostProcessor object to the end of the chain."""
187                 self._pps.append(pp)
188                 pp.set_downloader(self)
189         
190         def to_stdout(self, message, skip_eol=False):
191                 """Print message to stdout if not in quiet mode."""
192                 if not self.params.get('quiet', False):
193                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
194                         sys.stdout.flush()
195         
196         def to_stderr(self, message):
197                 """Print message to stderr."""
198                 print >>sys.stderr, message
199         
200         def fixed_template(self):
201                 """Checks if the output template is fixed."""
202                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
203
204         def trouble(self, message=None):
205                 """Determine action to take when a download problem appears.
206
207                 Depending on if the downloader has been configured to ignore
208                 download errors or not, this method may throw an exception or
209                 not when errors are found, after printing the message.
210                 """
211                 if message is not None:
212                         self.to_stderr(message)
213                 if not self.params.get('ignoreerrors', False):
214                         raise DownloadError(message)
215                 self._download_retcode = 1
216
217         def slow_down(self, start_time, byte_counter):
218                 """Sleep if the download speed is over the rate limit."""
219                 rate_limit = self.params.get('ratelimit', None)
220                 if rate_limit is None or byte_counter == 0:
221                         return
222                 now = time.time()
223                 elapsed = now - start_time
224                 if elapsed <= 0.0:
225                         return
226                 speed = float(byte_counter) / elapsed
227                 if speed > rate_limit:
228                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
229
230         def report_destination(self, filename):
231                 """Report destination filename."""
232                 self.to_stdout(u'[download] Destination: %s' % filename)
233         
234         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
235                 """Report download progress."""
236                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
237                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
238         
239         def report_finish(self):
240                 """Report download finished."""
241                 self.to_stdout(u'')
242
243         def process_info(self, info_dict):
244                 """Process a single dictionary returned by an InfoExtractor."""
245                 # Forced printings
246                 if self.params.get('forcetitle', False):
247                         print info_dict['title'].encode(locale.getpreferredencoding())
248                 if self.params.get('forceurl', False):
249                         print info_dict['url'].encode(locale.getpreferredencoding())
250                         
251                 # Do nothing else if in simulate mode
252                 if self.params.get('simulate', False):
253                         return
254
255                 try:
256                         filename = self.params['outtmpl'] % info_dict
257                         self.report_destination(filename)
258                 except (ValueError, KeyError), err:
259                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
260                 if self.params['nooverwrites'] and os.path.exists(filename):
261                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
262                         return
263                 try:
264                         self.pmkdir(filename)
265                 except (OSError, IOError), err:
266                         self.trouble('ERROR: unable to create directories: %s' % str(err))
267                         return
268                 try:
269                         outstream = open(filename, 'wb')
270                 except (OSError, IOError), err:
271                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
272                         return
273                 try:
274                         self._do_download(outstream, info_dict['url'])
275                         outstream.close()
276                 except (OSError, IOError), err:
277                         self.trouble('ERROR: unable to write video data: %s' % str(err))
278                         return
279                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
280                         self.trouble('ERROR: unable to download video data: %s' % str(err))
281                         return
282                 try:
283                         self.post_process(filename, info_dict)
284                 except (PostProcessingError), err:
285                         self.trouble('ERROR: postprocessing: %s' % str(err))
286                         return
287
288                 return
289
290         def download(self, url_list):
291                 """Download a given list of URLs."""
292                 if len(url_list) > 1 and self.fixed_template():
293                         raise SameFileError(self.params['outtmpl'])
294
295                 for url in url_list:
296                         suitable_found = False
297                         for ie in self._ies:
298                                 # Go to next InfoExtractor if not suitable
299                                 if not ie.suitable(url):
300                                         continue
301
302                                 # Suitable InfoExtractor found
303                                 suitable_found = True
304
305                                 # Extract information from URL and process it
306                                 ie.extract(url)
307
308                                 # Suitable InfoExtractor had been found; go to next URL
309                                 break
310
311                         if not suitable_found:
312                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
313
314                 return self._download_retcode
315
316         def post_process(self, filename, ie_info):
317                 """Run the postprocessing chain on the given file."""
318                 info = dict(ie_info)
319                 info['filepath'] = filename
320                 for pp in self._pps:
321                         info = pp.run(info)
322                         if info is None:
323                                 break
324         
325         def _do_download(self, stream, url):
326                 request = urllib2.Request(url, None, std_headers)
327                 data = urllib2.urlopen(request)
328                 data_len = data.info().get('Content-length', None)
329                 data_len_str = self.format_bytes(data_len)
330                 byte_counter = 0
331                 block_size = 1024
332                 start = time.time()
333                 while True:
334                         # Progress message
335                         percent_str = self.calc_percent(byte_counter, data_len)
336                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
337                         speed_str = self.calc_speed(start, time.time(), byte_counter)
338                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
339
340                         # Download and write
341                         before = time.time()
342                         data_block = data.read(block_size)
343                         after = time.time()
344                         data_block_len = len(data_block)
345                         if data_block_len == 0:
346                                 break
347                         byte_counter += data_block_len
348                         stream.write(data_block)
349                         block_size = self.best_block_size(after - before, data_block_len)
350
351                         # Apply rate limit
352                         self.slow_down(start, byte_counter)
353
354                 self.report_finish()
355                 if data_len is not None and str(byte_counter) != data_len:
356                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
357
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and pulls out everything the
	FileDownloader needs to know about the video (or videos) the URL
	refers to: the real media URL, the title and simplified title, the
	uploader, and so on. The result is handed over as a dictionary which
	the FileDownloader then processes, possibly downloading the video to
	the file system, among other possible outcomes. Each dictionary must
	contain the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.

	Concrete extractors re-define the _real_initialize() and
	_real_extract() methods as well as the static suitable() method, and
	are normally instantiated and registered with the main downloader.
	"""

	# Becomes True once _real_initialize() has been run
	_ready = False
	# The FileDownloader in charge of this extractor (may be None)
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return False

	def initialize(self):
		"""Run the one-time initialization (authentication, etc) if needed."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extract URL information, initializing first when necessary."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Attach the downloader in charge of this extractor."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
418
419 class YoutubeIE(InfoExtractor):
420         """Information extractor for youtube.com."""
421
422         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
423         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
424         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
425         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
426         _NETRC_MACHINE = 'youtube'
427
428         @staticmethod
429         def suitable(url):
430                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
431
432         @staticmethod
433         def htmlentity_transform(matchobj):
434                 """Transforms an HTML entity to a Unicode character."""
435                 entity = matchobj.group(1)
436
437                 # Known non-numeric HTML entity
438                 if entity in htmlentitydefs.name2codepoint:
439                         return unichr(htmlentitydefs.name2codepoint[entity])
440
441                 # Unicode character
442                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
443                 if mobj is not None:
444                         numstr = mobj.group(1)
445                         if numstr.startswith(u'x'):
446                                 base = 16
447                                 numstr = u'0%s' % numstr
448                         else:
449                                 base = 10
450                         return unichr(long(numstr, base))
451
452                 # Unknown entity in name, return its literal representation
453                 return (u'&%s;' % entity)
454
455         def report_lang(self):
456                 """Report attempt to set language."""
457                 self._downloader.to_stdout(u'[youtube] Setting language')
458
459         def report_login(self):
460                 """Report attempt to log in."""
461                 self._downloader.to_stdout(u'[youtube] Logging in')
462         
463         def report_age_confirmation(self):
464                 """Report attempt to confirm age."""
465                 self._downloader.to_stdout(u'[youtube] Confirming age')
466         
467         def report_webpage_download(self, video_id):
468                 """Report attempt to download webpage."""
469                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
470         
471         def report_information_extraction(self, video_id):
472                 """Report attempt to extract video information."""
473                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
474         
475         def report_video_url(self, video_id, video_real_url):
476                 """Report extracted video URL."""
477                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
478         
479         def _real_initialize(self):
480                 if self._downloader is None:
481                         return
482
483                 username = None
484                 password = None
485                 downloader_params = self._downloader.params
486
487                 # Attempt to use provided username and password or .netrc data
488                 if downloader_params.get('username', None) is not None:
489                         username = downloader_params['username']
490                         password = downloader_params['password']
491                 elif downloader_params.get('usenetrc', False):
492                         try:
493                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
494                                 if info is not None:
495                                         username = info[0]
496                                         password = info[2]
497                                 else:
498                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
499                         except (IOError, netrc.NetrcParseError), err:
500                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
501                                 return
502
503                 # Set language
504                 request = urllib2.Request(self._LANG_URL, None, std_headers)
505                 try:
506                         self.report_lang()
507                         urllib2.urlopen(request).read()
508                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
509                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
510                         return
511
512                 # No authentication to be performed
513                 if username is None:
514                         return
515
516                 # Log in
517                 login_form = {
518                                 'current_form': 'loginForm',
519                                 'next':         '/',
520                                 'action_login': 'Log In',
521                                 'username':     username,
522                                 'password':     password,
523                                 }
524                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
525                 try:
526                         self.report_login()
527                         login_results = urllib2.urlopen(request).read()
528                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
529                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
530                                 return
531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
532                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
533                         return
534         
535                 # Confirm age
536                 age_form = {
537                                 'next_url':             '/',
538                                 'action_confirm':       'Confirm',
539                                 }
540                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
541                 try:
542                         self.report_age_confirmation()
543                         age_results = urllib2.urlopen(request).read()
544                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
545                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
546                         return
547
548         def _real_extract(self, url):
549                 # Extract video id from URL
550                 mobj = re.match(self._VALID_URL, url)
551                 if mobj is None:
552                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
553                         return
554                 video_id = mobj.group(2)
555
556                 # Downloader parameters
557                 format_param = None
558                 if self._downloader is not None:
559                         params = self._downloader.params
560                         format_param = params.get('format', None)
561
562                 # Extension
563                 video_extension = {
564                         '17': '3gp',
565                         '18': 'mp4',
566                         '22': 'mp4',
567                 }.get(format_param, 'flv')
568
569                 # Normalize URL, including format
570                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
571                 if format_param is not None:
572                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
573                 request = urllib2.Request(normalized_url, None, std_headers)
574                 try:
575                         self.report_webpage_download(video_id)
576                         video_webpage = urllib2.urlopen(request).read()
577                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
578                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
579                         return
580                 self.report_information_extraction(video_id)
581                 
582                 # "t" param
583                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
584                 if mobj is None:
585                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
586                         return
587                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
588                 if format_param is not None:
589                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
590                 self.report_video_url(video_id, video_real_url)
591
592                 # uploader
593                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
594                 if mobj is None:
595                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
596                         return
597                 video_uploader = mobj.group(1)
598
599                 # title
600                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
601                 if mobj is None:
602                         self._downloader.trouble(u'ERROR: unable to extract video title')
603                         return
604                 video_title = mobj.group(1).decode('utf-8')
605                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
606                 video_title = video_title.replace(os.sep, u'%')
607
608                 # simplified title
609                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
610                 simple_title = simple_title.strip(ur'_')
611
612                 # Process video information
613                 self._downloader.process_info({
614                         'id':           video_id.decode('utf-8'),
615                         'url':          video_real_url.decode('utf-8'),
616                         'uploader':     video_uploader.decode('utf-8'),
617                         'title':        video_title,
618                         'stitle':       simple_title,
619                         'ext':          video_extension.decode('utf-8'),
620                         })
621
622 class MetacafeIE(InfoExtractor):
623         """Information Extractor for metacafe.com."""
624
625         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
626         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
627         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
628         _youtube_ie = None
629
630         def __init__(self, youtube_ie, downloader=None):
631                 InfoExtractor.__init__(self, downloader)
632                 self._youtube_ie = youtube_ie
633
634         @staticmethod
635         def suitable(url):
636                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
637
638         def report_disclaimer(self):
639                 """Report disclaimer retrieval."""
640                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
641
642         def report_age_confirmation(self):
643                 """Report attempt to confirm age."""
644                 self._downloader.to_stdout(u'[metacafe] Confirming age')
645         
646         def report_download_webpage(self, video_id):
647                 """Report webpage download."""
648                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
649         
650         def report_extraction(self, video_id):
651                 """Report information extraction."""
652                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
653
654         def _real_initialize(self):
655                 # Retrieve disclaimer
656                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
657                 try:
658                         self.report_disclaimer()
659                         disclaimer = urllib2.urlopen(request).read()
660                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
661                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
662                         return
663
664                 # Confirm age
665                 disclaimer_form = {
666                         'filters': '0',
667                         'submit': "Continue - I'm over 18",
668                         }
669                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
670                 try:
671                         self.report_age_confirmation()
672                         disclaimer = urllib2.urlopen(request).read()
673                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
674                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
675                         return
676         
677         def _real_extract(self, url):
678                 # Extract id and simplified title from URL
679                 mobj = re.match(self._VALID_URL, url)
680                 if mobj is None:
681                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
682                         return
683
684                 video_id = mobj.group(1)
685
686                 # Check if video comes from YouTube
687                 mobj2 = re.match(r'^yt-(.*)$', video_id)
688                 if mobj2 is not None:
689                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
690                         return
691
692                 simple_title = mobj.group(2).decode('utf-8')
693                 video_extension = 'flv'
694
695                 # Retrieve video webpage to extract further information
696                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
697                 try:
698                         self.report_download_webpage(video_id)
699                         webpage = urllib2.urlopen(request).read()
700                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
701                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
702                         return
703
704                 # Extract URL, uploader and title from webpage
705                 self.report_extraction(video_id)
706                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
707                 if mobj is None:
708                         self._downloader.trouble(u'ERROR: unable to extract media URL')
709                         return
710                 mediaURL = urllib.unquote(mobj.group(1))
711
712                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
713                 if mobj is None:
714                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
715                         return
716                 gdaKey = mobj.group(1)
717
718                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
719
720                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
721                 if mobj is None:
722                         self._downloader.trouble(u'ERROR: unable to extract title')
723                         return
724                 video_title = mobj.group(1).decode('utf-8')
725
726                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
727                 if mobj is None:
728                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
729                         return
730                 video_uploader = mobj.group(1)
731
732                 # Process video information
733                 self._downloader.process_info({
734                         'id':           video_id.decode('utf-8'),
735                         'url':          video_url.decode('utf-8'),
736                         'uploader':     video_uploader.decode('utf-8'),
737                         'title':        video_title,
738                         'stitle':       simple_title,
739                         'ext':          video_extension.decode('utf-8'),
740                         })
741
742
743 class YoutubeSearchIE(InfoExtractor):
744         """Information Extractor for YouTube search queries."""
745         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
746         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
747         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
748         _MORE_PAGES_INDICATOR = r'>Next</a>'
749         _youtube_ie = None
750         _max_youtube_results = 1000
751
752         def __init__(self, youtube_ie, downloader=None):
753                 InfoExtractor.__init__(self, downloader)
754                 self._youtube_ie = youtube_ie
755         
756         @staticmethod
757         def suitable(url):
758                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
759
760         def report_download_page(self, query, pagenum):
761                 """Report attempt to download playlist page with given number."""
762                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
763
764         def _real_initialize(self):
765                 self._youtube_ie.initialize()
766         
767         def _real_extract(self, query):
768                 mobj = re.match(self._VALID_QUERY, query)
769                 if mobj is None:
770                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
771                         return
772
773                 prefix, query = query.split(':')
774                 prefix = prefix[8:]
775                 if prefix == '':
776                         self._download_n_results(query, 1)
777                         return
778                 elif prefix == 'all':
779                         self._download_n_results(query, self._max_youtube_results)
780                         return
781                 else:
782                         try:
783                                 n = int(prefix)
784                                 if n <= 0:
785                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
786                                         return
787                                 elif n > self._max_youtube_results:
788                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
789                                         n = self._max_youtube_results
790                                 self._download_n_results(query, n)
791                                 return
792                         except ValueError: # parsing prefix as int fails
793                                 self._download_n_results(query, 1)
794                                 return
795
796         def _download_n_results(self, query, n):
797                 """Downloads a specified number of results for a query"""
798
799                 video_ids = []
800                 already_seen = set()
801                 pagenum = 1
802
803                 while True:
804                         self.report_download_page(query, pagenum)
805                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
806                         request = urllib2.Request(result_url, None, std_headers)
807                         try:
808                                 page = urllib2.urlopen(request).read()
809                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
810                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
811                                 return
812
813                         # Extract video identifiers
814                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
815                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
816                                 if video_id not in already_seen:
817                                         video_ids.append(video_id)
818                                         already_seen.add(video_id)
819                                         if len(video_ids) == n:
820                                                 # Specified n videos reached
821                                                 for id in video_ids:
822                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
823                                                 return
824
825                         if self._MORE_PAGES_INDICATOR not in page:
826                                 for id in video_ids:
827                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
828                                 return
829
830                         pagenum = pagenum + 1
831
832 class YoutubePlaylistIE(InfoExtractor):
833         """Information Extractor for YouTube playlists."""
834
835         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
836         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
837         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
838         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
839         _youtube_ie = None
840
841         def __init__(self, youtube_ie, downloader=None):
842                 InfoExtractor.__init__(self, downloader)
843                 self._youtube_ie = youtube_ie
844         
845         @staticmethod
846         def suitable(url):
847                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
848
849         def report_download_page(self, playlist_id, pagenum):
850                 """Report attempt to download playlist page with given number."""
851                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
852
853         def _real_initialize(self):
854                 self._youtube_ie.initialize()
855         
856         def _real_extract(self, url):
857                 # Extract playlist id
858                 mobj = re.match(self._VALID_URL, url)
859                 if mobj is None:
860                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
861                         return
862
863                 # Download playlist pages
864                 playlist_id = mobj.group(1)
865                 video_ids = []
866                 pagenum = 1
867
868                 while True:
869                         self.report_download_page(playlist_id, pagenum)
870                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
871                         try:
872                                 page = urllib2.urlopen(request).read()
873                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
874                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
875                                 return
876
877                         # Extract video identifiers
878                         ids_in_page = []
879                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
880                                 if mobj.group(1) not in ids_in_page:
881                                         ids_in_page.append(mobj.group(1))
882                         video_ids.extend(ids_in_page)
883
884                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
885                                 break
886                         pagenum = pagenum + 1
887
888                 for id in video_ids:
889                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
890                 return
891
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a FileDownloader via its
	add_post_processor() method. After a successful download the
	downloader walks its chain of PostProcessors, feeding the first one
	an initial information dictionary and each subsequent one the value
	returned by its predecessor.

	Returning None from run() halts the chain; otherwise processing
	continues until the last registered object has run.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this postprocessor is attached to (set on registration).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the FileDownloader instance this PP belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Execute this postprocessing step.

		"information" is a dictionary in the format produced by the
		InfoExtractors, extended with a "filepath" key naming the file
		that was just downloaded.

		Return None to stop the postprocessing chain, or an (optionally
		modified) information dictionary to pass on to the next
		PostProcessor in the chain. Implementations may also raise
		PostProcessingError to report a failure to the downloader that
		invoked them.
		"""
		return information # default implementation: pass data through unchanged
937         
938 ### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# General configuration
		# Build ONE opener carrying both the proxy handler and the
		# cookie processor: calling install_opener() twice in a row
		# would leave only the last opener active, discarding the first.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
				usage='Usage: %prog [options] url...',
				version='INTERNAL',
				conflict_handler='resolve',
				)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		parser.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				# Close the file handle explicitly instead of leaking it.
				batchfd = open(opts.batchfile, 'r')
				try:
					batchurls = batchfd.readlines()
				finally:
					batchfd.close()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			})
		# Order matters: more specific extractors (search, playlist,
		# metacafe) must be registered before the plain YouTube one.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')