Remove old ignore patterns which are no longer needed
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
# HTTP headers sent with every request; impersonates a contemporary
# Firefox so sites serve the same pages they would to a real browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode characters allowed in a "simplified" title (used to build safe
# file names); everything else is collapsed to underscores (see YoutubeIE).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        continue on errors; carries the corresponding error message.
        """
38
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when several downloads would end up
        being written to the same file on disk.
        """
46
class PostProcessingError(Exception):
        """Post Processing exception.

        A PostProcessor's .run() method may raise this to signal that the
        postprocessing task failed.
        """
54
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader and the latter downloads the
        file or does whatever it's instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        """

        # Class-level defaults; the real values are rebound per instance in
        # __init__, so these mostly document the attribute names.
        params = None
        _ies = []
        _pps = []

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []          # registered InfoExtractors, in priority order
                self._pps = []          # PostProcessor chain, run in order
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # Build every directory prefix of the path; the range stops one
                # short so the final component (the file itself) is excluded.
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Return a human-readable string for a byte quantity (e.g. '1.00k')."""
                if bytes is None:
                        return 'N/A'
                if bytes == 0:
                        exponent = 0
                else:
                        # log base 1024 picks the suffix; quantities >= 1024**9
                        # would index past the suffix string.
                        exponent = long(math.log(float(bytes), 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return a fixed-width percentage string; '---.-%' when length unknown."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate remaining download time as 'MM:SS'; '--:--' when unknown."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                # The minutes field is only two digits wide; give up past 99
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return a right-aligned transfer-speed string (e.g. '  1.00k/s')."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Choose the next read size from the last block's throughput."""
                # Keep the new size within a factor of two of the last block
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # An empty suffix yields str.index('') == 0, i.e. multiplier 1
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                # Mutual registration: the IE gets a back-reference to us
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self.params.get('quiet', False):
                        # Trailing comma suppresses print's own newline; the
                        # indexed list supplies one unless skip_eol is set.
                        print u'%s%s' % (message, [u'\n', u''][skip_eol]),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def fixed_template(self):
                """Checks if the output template is fixed.

                Fixed means it contains no %(...)s placeholders, so every
                download would be written to the same file name.
                """
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message. If it
                doesn't raise, it returns an error code suitable to be returned
                later as a program exit code to indicate error.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                return 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep just long enough for the average speed to drop
                        # back to the configured limit
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress (rewrites the current line via \\r)."""
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_finish(self):
                """Report download finished."""
                self.to_stdout(u'')

        def download(self, url_list):
                """Download a given list of URLs.

                Returns 0 on success, or the last error code produced by
                trouble() when errors were ignored.
                """
                retcode = 0
                # A fixed template would write every video to the same file,
                # so refuse multiple URLs up front.
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                all_results = ie.extract(url)
                                # None entries mark per-video extraction failures
                                results = [x for x in all_results if x is not None]
                                if len(results) != len(all_results):
                                        retcode = self.trouble()

                                if len(results) > 1 and self.fixed_template():
                                        raise SameFileError(self.params['outtmpl'])

                                for result in results:
                                        # Forced printings
                                        if self.params.get('forcetitle', False):
                                                print result['title']
                                        if self.params.get('forceurl', False):
                                                print result['url']

                                        # Do nothing else if in simulate mode
                                        if self.params.get('simulate', False):
                                                continue

                                        try:
                                                filename = self.params['outtmpl'] % result
                                                self.report_destination(filename)
                                        except (ValueError, KeyError), err:
                                                retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                                                continue
                                        if self.params['nooverwrites'] and os.path.exists(filename):
                                                self.to_stderr('WARNING: file exists: %s; skipping' % filename)
                                                continue
                                        try:
                                                self.pmkdir(filename)
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
                                                continue
                                        try:
                                                outstream = open(filename, 'wb')
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                                continue
                                        try:
                                                self._do_download(outstream, result['url'])
                                                outstream.close()
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
                                                continue
                                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                                retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
                                                continue
                                        try:
                                                self.post_process(filename, result)
                                        except (PostProcessingError), err:
                                                retcode = self.trouble('ERROR: postprocessing: %s' % str(err))
                                                continue

                                # Only the first suitable InfoExtractor handles the URL
                                break
                        if not suitable_found:
                                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        # A postprocessor returning None stops the chain
                        if info is None:
                                break

        def _do_download(self, stream, url):
                """Download url into the open file object stream, reporting progress."""
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                # NOTE: Content-length is kept as a string (or None); the
                # comparisons below rely on that.
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len
                        stream.write(data_block)
                        # Adapt the next read size to the measured throughput
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                # data_len is a header string, hence the str() on the counter
                if data_len is not None and str(byte_counter) != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
355
class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """

        # _ready: True once _real_initialize has run (see initialize()).
        # _downloader: owning FileDownloader, or None when used stand-alone.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                # The base class accepts nothing; subclasses override this.
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                # Lazy one-shot initialization; safe to call repeatedly
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout if downloader is not in quiet mode."""
                # Without a downloader there is no 'quiet' option, so print
                if self._downloader is None or not self._downloader.params.get('quiet', False):
                        print message

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
424
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

        # group(1): optional scheme/host/path prefix, group(2): the video id.
        # The conditional group (?(1).+)? only permits trailing text when a
        # prefix was matched, so a bare id must match exactly.
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        _NETRC_MACHINE = 'youtube'      # machine entry looked up in ~/.netrc

        @staticmethod
        def suitable(url):
                """Return True if url matches the YouTube URL pattern."""
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        def report_lang(self):
                """Report attempt to set language."""
                self.to_stdout(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self.to_stdout(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[youtube] Confirming age')

        def report_webpage_download(self, video_id):
                """Report attempt to download webpage."""
                self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

        def report_video_url(self, video_id, video_real_url):
                """Report extracted video URL."""
                self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

        def _real_initialize(self):
                """Set site language, then optionally log in and confirm age.

                Credentials come from the downloader params or from .netrc;
                without a downloader nothing is done.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        # authenticators() returns (login, account, password)
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL, None, std_headers)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # Seeing the login form again means the login failed
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Extract the real video URL and metadata for a YouTube page.

                Returns a one-element list with the info dict, or [None] on
                error.
                """
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.params
                        format_param = params.get('format', None)

                # Extension (fmt 18 is mp4, 17 is 3gp; everything else flv)
                video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.report_webpage_download(video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
                        return [None]
                self.report_information_extraction(video_id)

                # "t" param, required to build the real get_video URL below
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.report_video_url(video_id, video_real_url)

                # uploader
                mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                # Expand HTML entities, then make the title safe for use in
                # a filesystem path by replacing the separator
                video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
                video_title = video_title.replace(os.sep, u'%')

                # simplified title: runs of disallowed characters become one '_'
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # Return information
                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_real_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension.decode('utf-8'),
                        }]
600
601 class MetacafeIE(InfoExtractor):
602         """Information Extractor for metacafe.com."""
603
604         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
605         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
606         _youtube_ie = None
607
        def __init__(self, youtube_ie, downloader=None):
                """Constructor. Stores a YoutubeIE used to delegate extraction
                of Metacafe entries that are really YouTube videos ('yt-' ids)."""
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie
611
        @staticmethod
        def suitable(url):
                """Return True if url matches the Metacafe watch-URL pattern."""
                return (re.match(MetacafeIE._VALID_URL, url) is not None)
615
        def report_disclaimer(self):
                """Report disclaimer retrieval (respects the downloader's quiet mode)."""
                self.to_stdout(u'[metacafe] Retrieving disclaimer')
619
        def report_age_confirmation(self):
                """Report attempt to confirm age (respects the downloader's quiet mode)."""
                self.to_stdout(u'[metacafe] Confirming age')
623         
        def report_download_webpage(self, video_id):
                """Report webpage download for the given video id."""
                self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
627         
        def report_extraction(self, video_id):
                """Report information extraction for the given video id."""
                self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
631
        def _real_initialize(self):
                """Fetch the disclaimer page and submit the age-confirmation form.

                Errors are reported to stderr and swallowed; extraction then
                proceeds without the confirmation.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age ('filters': '0' presumably disables the family
                # filter -- TODO confirm against the site's form)
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return
654         
655         def _real_extract(self, url):
656                 # Extract id and simplified title from URL
657                 mobj = re.match(self._VALID_URL, url)
658                 if mobj is None:
659                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
660                         return [None]
661
662                 video_id = mobj.group(1)
663
664                 # Check if video comes from YouTube
665                 mobj2 = re.match(r'^yt-(.*)$', video_id)
666                 if mobj2 is not None:
667                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
668
669                 simple_title = mobj.group(2).decode('utf-8')
670                 video_extension = 'flv'
671
672                 # Retrieve video webpage to extract further information
673                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
674                 try:
675                         self.report_download_webpage(video_id)
676                         webpage = urllib2.urlopen(request).read()
677                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
678                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
679                         return [None]
680
681                 # Extract URL, uploader and title from webpage
682                 self.report_extraction(video_id)
683                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
684                 if mobj is None:
685                         self.to_stderr(u'ERROR: unable to extract media URL')
686                         return [None]
687                 mediaURL = mobj.group(1).replace('\\', '')
688
689                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
690                 if mobj is None:
691                         self.to_stderr(u'ERROR: unable to extract gdaKey')
692                         return [None]
693                 gdaKey = mobj.group(1)
694
695                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
696
697                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
698                 if mobj is None:
699                         self.to_stderr(u'ERROR: unable to extract title')
700                         return [None]
701                 video_title = mobj.group(1).decode('utf-8')
702
703                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
704                 if mobj is None:
705                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
706                         return [None]
707                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
708
709                 # Return information
710                 return [{
711                         'id':           video_id.decode('utf-8'),
712                         'url':          video_url.decode('utf-8'),
713                         'uploader':     video_uploader.decode('utf-8'),
714                         'title':        video_title,
715                         'stitle':       simple_title,
716                         'ext':          video_extension.decode('utf-8'),
717                         }]
718
719
720 class YoutubeSearchIE(InfoExtractor):
721         """Information Extractor for YouTube search queries."""
722         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
723         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
724         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
725         _MORE_PAGES_INDICATOR = r'>Next</a>'
726         _youtube_ie = None
727
728         def __init__(self, youtube_ie, downloader=None): 
729                 InfoExtractor.__init__(self, downloader)
730                 self._youtube_ie = youtube_ie
731         
732         @staticmethod
733         def suitable(url):
734                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
735
736         def report_download_page(self, query, pagenum):
737                 """Report attempt to download playlist page with given number."""
738                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
739
740         def _real_initialize(self):
741                 self._youtube_ie.initialize()
742         
743         def _real_extract(self, query):
744                 mobj = re.match(self._VALID_QUERY, query)
745                 if mobj is None:
746                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
747                         return [None]
748
749                 prefix, query = query.split(':')
750                 prefix = prefix[8:]
751                 if prefix == '': 
752                         return self._download_n_results(query, 1)
753                 elif prefix == 'all': 
754                         return self._download_n_results(query, -1)
755                 else: 
756                         try:
757                                 n = int(prefix)
758                                 if n <= 0:
759                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
760                                         return [None]
761                                 return self._download_n_results(query, n)
762                         except ValueError: # parsing prefix as int fails
763                                 return self._download_n_results(query, 1)
764
765         def _download_n_results(self, query, n):
766                 """Downloads a specified number of results for a query"""
767
768                 video_ids = []
769                 already_seen = set()
770                 pagenum = 1
771
772                 while True:
773                         self.report_download_page(query, pagenum)
774                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
775                         request = urllib2.Request(result_url, None, std_headers)
776                         try:
777                                 page = urllib2.urlopen(request).read()
778                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
779                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
780                                 return [None]
781
782                         # Extract video identifiers
783                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
784                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
785                                 if video_id not in already_seen:
786                                         video_ids.append(video_id)
787                                         already_seen.add(video_id)
788                                         if len(video_ids) == n:
789                                                 # Specified n videos reached
790                                                 information = []
791                                                 for id in video_ids:
792                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
793                                                 return information
794
795                         if self._MORE_PAGES_INDICATOR not in page:
796                                 information = []
797                                 for id in video_ids:
798                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
799                                 return information
800
801                         pagenum = pagenum + 1
802
803 class YoutubePlaylistIE(InfoExtractor):
804         """Information Extractor for YouTube playlists."""
805
806         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
807         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
808         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
809         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
810         _youtube_ie = None
811
812         def __init__(self, youtube_ie, downloader=None):
813                 InfoExtractor.__init__(self, downloader)
814                 self._youtube_ie = youtube_ie
815         
816         @staticmethod
817         def suitable(url):
818                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
819
820         def report_download_page(self, playlist_id, pagenum):
821                 """Report attempt to download playlist page with given number."""
822                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
823
824         def _real_initialize(self):
825                 self._youtube_ie.initialize()
826         
827         def _real_extract(self, url):
828                 # Extract playlist id
829                 mobj = re.match(self._VALID_URL, url)
830                 if mobj is None:
831                         self.to_stderr(u'ERROR: invalid url: %s' % url)
832                         return [None]
833
834                 # Download playlist pages
835                 playlist_id = mobj.group(1)
836                 video_ids = []
837                 pagenum = 1
838
839                 while True:
840                         self.report_download_page(playlist_id, pagenum)
841                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
842                         try:
843                                 page = urllib2.urlopen(request).read()
844                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
845                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
846                                 return [None]
847
848                         # Extract video identifiers
849                         ids_in_page = []
850                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
851                                 if mobj.group(1) not in ids_in_page:
852                                         ids_in_page.append(mobj.group(1))
853                         video_ids.extend(ids_in_page)
854
855                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
856                                 break
857                         pagenum = pagenum + 1
858
859                 information = []
860                 for id in video_ids:
861                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
862                 return information
863
864 class PostProcessor(object):
865         """Post Processor class.
866
867         PostProcessor objects can be added to downloaders with their
868         add_post_processor() method. When the downloader has finished a
869         successful download, it will take its internal chain of PostProcessors
870         and start calling the run() method on each one of them, first with
871         an initial argument and then with the returned value of the previous
872         PostProcessor.
873
874         The chain will be stopped if one of them ever returns None or the end
875         of the chain is reached.
876
877         PostProcessor objects follow a "mutual registration" process similar
878         to InfoExtractor objects.
879         """
880
881         _downloader = None
882
883         def __init__(self, downloader=None):
884                 self._downloader = downloader
885
886         def to_stdout(self, message):
887                 """Print message to stdout if downloader is not in quiet mode."""
888                 if self._downloader is None or not self._downloader.params.get('quiet', False):
889                         print message
890         
891         def to_stderr(self, message):
892                 """Print message to stderr."""
893                 print >>sys.stderr, message
894
895         def set_downloader(self, downloader):
896                 """Sets the downloader for this PP."""
897                 self._downloader = downloader
898         
899         def run(self, information):
900                 """Run the PostProcessor.
901
902                 The "information" argument is a dictionary like the ones
903                 returned by InfoExtractors. The only difference is that this
904                 one has an extra field called "filepath" that points to the
905                 downloaded file.
906
907                 When this method returns None, the postprocessing chain is
908                 stopped. However, this method may return an information
909                 dictionary that will be passed to the next postprocessing
910                 object in the chain. It can be the one it received after
911                 changing some fields.
912
913                 In addition, this method may raise a PostProcessingError
914                 exception that will be taken into account by the downloader
915                 it was called from.
916                 """
917                 return information # by default, do nothing
918         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# General configuration
		# NOTE(review): install_opener replaces the global opener, so the
		# second call discards the ProxyHandler installed by the first --
		# confirm whether proxy support is intentionally dropped
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		# conflict_handler='resolve' lets -h/-v be redefined below
		parser = optparse.OptionParser(
				usage='Usage: %prog [options] url...',
				version='INTERNAL',
				conflict_handler='resolve',
				)
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		parser.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		parser.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		parser.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		parser.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		parser.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		parser.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		parser.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		parser.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option('-f', '--format',
				dest='format', metavar='FMT', help='video format code')
		parser.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='alias for -f 18', const='18')
		parser.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		parser.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		(opts, args) = parser.parse_args()

		# Batch file verification
		# URLs from the batch file are processed before command-line ones
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = [line.strip() for line in open(opts.batchfile, 'r')]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if len(all_urls) < 1:
			sys.exit(u'ERROR: you must provide at least one URL')
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			sys.exit(u'ERROR: account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			sys.exit(u'ERROR: using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			sys.exit(u'ERROR: using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert e.g. "50k" / "44.6m" into bytes per second
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				sys.exit(u'ERROR: invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		# The non-YouTube IEs delegate to youtube_ie for yt-hosted content
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)

		# File downloader
		# Locale charset decodes the -o template typed on the command line
		charset = locale.getdefaultlocale()[1]
		if charset is None:
			charset = 'ascii'
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# "and/or" chain: explicit -o wins, then -t, then -l,
			# finally the plain id-based default
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(charset))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			})
		# IEs are registered most-specific first (search, playlist,
		# metacafe, plain youtube); presumably the downloader tries
		# them in this order -- confirm in FileDownloader
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		# FileDownloader already reported the error; just signal failure
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')