c526071e5c6f110a5222cec60a236d5865e0c7d0
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
class DownloadError(Exception):
        """Error raised when a download problem must stop the program.

        FileDownloader objects raise it when they have not been configured
        to ignore download errors. The exception carries the corresponding
        error message.
        """
38
class SameFileError(Exception):
        """Error raised when several downloads would share one output file.

        FileDownloader objects raise it when they detect that more than one
        file would have to be written to the same name on disk.
        """
46
class PostProcessingError(Exception):
        """Error raised when a postprocessing task fails.

        The .run() method of a PostProcessor may raise it to signal that the
        postprocessing step could not be completed.
        """
54
55 class FileDownloader(object):
56         """File Downloader class.
57
58         File downloader objects are the ones responsible of downloading the
59         actual video file and writing it to disk if the user has requested
60         it, among some other tasks. In most cases there should be one per
61         program. As, given a video URL, the downloader doesn't know how to
62         extract all the needed information, task that InfoExtractors do, it
63         has to pass the URL to one of them.
64
65         For this, file downloader objects have a method that allows
66         InfoExtractors to be registered in a given order. When it is passed
67         a URL, the file downloader handles it to the first InfoExtractor it
68         finds that reports being able to handle it. The InfoExtractor returns
69         all the information to the FileDownloader and the latter downloads the
70         file or does whatever it's instructed to do.
71
72         File downloaders accept a lot of parameters. In order not to saturate
73         the object constructor with arguments, it receives a dictionary of
74         options instead. These options are available through the params
75         attribute for the InfoExtractors to use. The FileDownloader also
76         registers itself as the downloader in charge for the InfoExtractors
77         that are added to it, so this is a "mutual registration".
78
79         Available options:
80
81         username:       Username for authentication purposes.
82         password:       Password for authentication purposes.
83         usenetrc:       Use netrc for authentication instead.
84         quiet:          Do not print messages to stdout.
85         forceurl:       Force printing final URL.
86         forcetitle:     Force printing title.
87         simulate:       Do not download the video files.
88         format:         Video format code.
89         outtmpl:        Template for output names.
90         ignoreerrors:   Do not stop on download errors.
91         ratelimit:      Download speed limit, in bytes/sec.
92         nooverwrites:   Prevent overwriting files.
93         """
94
95         params = None
96         _ies = []
97         _pps = []
98
99         def __init__(self, params):
100                 """Create a FileDownloader object with the given options."""
101                 self._ies = []
102                 self._pps = []
103                 self.params = params
104         
105         @staticmethod
106         def pmkdir(filename):
107                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
108                 components = filename.split(os.sep)
109                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
110                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
111                 for dir in aggregate:
112                         if not os.path.exists(dir):
113                                 os.mkdir(dir)
114         
115         @staticmethod
116         def format_bytes(bytes):
117                 if bytes is None:
118                         return 'N/A'
119                 if bytes == 0:
120                         exponent = 0
121                 else:
122                         exponent = long(math.log(float(bytes), 1024.0))
123                 suffix = 'bkMGTPEZY'[exponent]
124                 converted = float(bytes) / float(1024**exponent)
125                 return '%.2f%s' % (converted, suffix)
126
127         @staticmethod
128         def calc_percent(byte_counter, data_len):
129                 if data_len is None:
130                         return '---.-%'
131                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
132
133         @staticmethod
134         def calc_eta(start, now, total, current):
135                 if total is None:
136                         return '--:--'
137                 dif = now - start
138                 if current == 0 or dif < 0.001: # One millisecond
139                         return '--:--'
140                 rate = float(current) / dif
141                 eta = long((float(total) - float(current)) / rate)
142                 (eta_mins, eta_secs) = divmod(eta, 60)
143                 if eta_mins > 99:
144                         return '--:--'
145                 return '%02d:%02d' % (eta_mins, eta_secs)
146
147         @staticmethod
148         def calc_speed(start, now, bytes):
149                 dif = now - start
150                 if bytes == 0 or dif < 0.001: # One millisecond
151                         return '%10s' % '---b/s'
152                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
153
154         @staticmethod
155         def best_block_size(elapsed_time, bytes):
156                 new_min = max(bytes / 2.0, 1.0)
157                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
158                 if elapsed_time < 0.001:
159                         return int(new_max)
160                 rate = bytes / elapsed_time
161                 if rate > new_max:
162                         return int(new_max)
163                 if rate < new_min:
164                         return int(new_min)
165                 return int(rate)
166
167         @staticmethod
168         def parse_bytes(bytestr):
169                 """Parse a string indicating a byte quantity into a long integer."""
170                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
171                 if matchobj is None:
172                         return None
173                 number = float(matchobj.group(1))
174                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
175                 return long(round(number * multiplier))
176
177         def add_info_extractor(self, ie):
178                 """Add an InfoExtractor object to the end of the list."""
179                 self._ies.append(ie)
180                 ie.set_downloader(self)
181         
182         def add_post_processor(self, pp):
183                 """Add a PostProcessor object to the end of the chain."""
184                 self._pps.append(pp)
185                 pp.set_downloader(self)
186         
187         def to_stdout(self, message, skip_eol=False):
188                 """Print message to stdout if not in quiet mode."""
189                 if not self.params.get('quiet', False):
190                         print u'%s%s' % (message, [u'\n', u''][skip_eol]),
191                         sys.stdout.flush()
192         
193         def to_stderr(self, message):
194                 """Print message to stderr."""
195                 print >>sys.stderr, message
196         
197         def fixed_template(self):
198                 """Checks if the output template is fixed."""
199                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
200
201         def trouble(self, message=None):
202                 """Determine action to take when a download problem appears.
203
204                 Depending on if the downloader has been configured to ignore
205                 download errors or not, this method may throw an exception or
206                 not when errors are found, after printing the message. If it
207                 doesn't raise, it returns an error code suitable to be returned
208                 later as a program exit code to indicate error.
209                 """
210                 if message is not None:
211                         self.to_stderr(message)
212                 if not self.params.get('ignoreerrors', False):
213                         raise DownloadError(message)
214                 return 1
215
216         def slow_down(self, start_time, byte_counter):
217                 """Sleep if the download speed is over the rate limit."""
218                 rate_limit = self.params.get('ratelimit', None)
219                 if rate_limit is None or byte_counter == 0:
220                         return
221                 now = time.time()
222                 elapsed = now - start_time
223                 if elapsed <= 0.0:
224                         return
225                 speed = float(byte_counter) / elapsed
226                 if speed > rate_limit:
227                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
228
229         def report_destination(self, filename):
230                 """Report destination filename."""
231                 self.to_stdout(u'[download] Destination: %s' % filename)
232         
233         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
234                 """Report download progress."""
235                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
236                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
237         
238         def report_finish(self):
239                 """Report download finished."""
240                 self.to_stdout(u'')
241
242         def process_info(self, info_dict):
243                 """Process a single dictionary returned by an InfoExtractor."""
244                 # Forced printings
245                 if self.params.get('forcetitle', False):
246                         print info_dict['title']
247                 if self.params.get('forceurl', False):
248                         print info_dict['url']
249                         
250                 # Do nothing else if in simulate mode
251                 if self.params.get('simulate', False):
252                         return 0
253
254                 try:
255                         filename = self.params['outtmpl'] % info_dict
256                         self.report_destination(filename)
257                 except (ValueError, KeyError), err:
258                         return self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
259                 if self.params['nooverwrites'] and os.path.exists(filename):
260                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
261                         return 0
262                 try:
263                         self.pmkdir(filename)
264                 except (OSError, IOError), err:
265                         return self.trouble('ERROR: unable to create directories: %s' % str(err))
266                 try:
267                         outstream = open(filename, 'wb')
268                 except (OSError, IOError), err:
269                         return self.trouble('ERROR: unable to open for writing: %s' % str(err))
270                 try:
271                         self._do_download(outstream, info_dict['url'])
272                         outstream.close()
273                 except (OSError, IOError), err:
274                         return self.trouble('ERROR: unable to write video data: %s' % str(err))
275                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
276                         return self.trouble('ERROR: unable to download video data: %s' % str(err))
277                 try:
278                         self.post_process(filename, info_dict)
279                 except (PostProcessingError), err:
280                         return self.trouble('ERROR: postprocessing: %s' % str(err))
281
282                 return 0
283
284         def download(self, url_list):
285                 """Download a given list of URLs."""
286                 retcode = 0
287                 if len(url_list) > 1 and self.fixed_template():
288                         raise SameFileError(self.params['outtmpl'])
289
290                 for url in url_list:
291                         suitable_found = False
292                         for ie in self._ies:
293                                 # Go to next InfoExtractor if not suitable
294                                 if not ie.suitable(url):
295                                         continue
296
297                                 # Suitable InfoExtractor found
298                                 suitable_found = True
299
300                                 # Extract information from URL
301                                 all_results = ie.extract(url)
302                                 results = [x for x in all_results if x is not None]
303
304                                 # See if there were problems extracting any information
305                                 if len(results) != len(all_results):
306                                         retcode = self.trouble()
307
308                                 # Two results could go to the same file
309                                 if len(results) > 1 and self.fixed_template():
310                                         raise SameFileError(self.params['outtmpl'])
311
312                                 # Process each result
313                                 for result in results:
314                                         result = self.process_info(result)
315
316                                         # Do not overwrite an error code with a success code
317                                         if result != 0:
318                                                 retcode = result
319
320                                 # Suitable InfoExtractor had been found; go to next URL
321                                 break
322
323                         if not suitable_found:
324                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
325
326                 return retcode
327
328         def post_process(self, filename, ie_info):
329                 """Run the postprocessing chain on the given file."""
330                 info = dict(ie_info)
331                 info['filepath'] = filename
332                 for pp in self._pps:
333                         info = pp.run(info)
334                         if info is None:
335                                 break
336         
337         def _do_download(self, stream, url):
338                 request = urllib2.Request(url, None, std_headers)
339                 data = urllib2.urlopen(request)
340                 data_len = data.info().get('Content-length', None)
341                 data_len_str = self.format_bytes(data_len)
342                 byte_counter = 0
343                 block_size = 1024
344                 start = time.time()
345                 while True:
346                         # Progress message
347                         percent_str = self.calc_percent(byte_counter, data_len)
348                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
349                         speed_str = self.calc_speed(start, time.time(), byte_counter)
350                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
351
352                         # Download and write
353                         before = time.time()
354                         data_block = data.read(block_size)
355                         after = time.time()
356                         data_block_len = len(data_block)
357                         if data_block_len == 0:
358                                 break
359                         byte_counter += data_block_len
360                         stream.write(data_block)
361                         block_size = self.best_block_size(after - before, data_block_len)
362
363                         # Apply rate limit
364                         self.slow_down(start, byte_counter)
365
366                 self.report_finish()
367                 if data_len is not None and str(byte_counter) != data_len:
368                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
369
370 class InfoExtractor(object):
371         """Information Extractor class.
372
373         Information extractors are the classes that, given a URL, extract
374         information from the video (or videos) the URL refers to. This
375         information includes the real video URL, the video title and simplified
376         title, author and others. It is returned in a list of dictionaries when
377         calling its extract() method. It is a list because a URL can refer to
378         more than one video (think of playlists). The dictionaries must include
379         the following fields:
380
381         id:             Video identifier.
382         url:            Final video URL.
383         uploader:       Nickname of the video uploader.
384         title:          Literal title.
385         stitle:         Simplified title.
386         ext:            Video filename extension.
387
388         Subclasses of this one should re-define the _real_initialize() and
389         _real_extract() methods, as well as the suitable() static method.
390         Probably, they should also be instantiated and added to the main
391         downloader.
392         """
393
394         _ready = False
395         _downloader = None
396
397         def __init__(self, downloader=None):
398                 """Constructor. Receives an optional downloader."""
399                 self._ready = False
400                 self.set_downloader(downloader)
401
402         @staticmethod
403         def suitable(url):
404                 """Receives a URL and returns True if suitable for this IE."""
405                 return False
406
407         def initialize(self):
408                 """Initializes an instance (authentication, etc)."""
409                 if not self._ready:
410                         self._real_initialize()
411                         self._ready = True
412
413         def extract(self, url):
414                 """Extracts URL information and returns it in list of dicts."""
415                 self.initialize()
416                 return self._real_extract(url)
417
418         def set_downloader(self, downloader):
419                 """Sets the downloader for this IE."""
420                 self._downloader = downloader
421         
422         def to_stdout(self, message):
423                 """Print message to stdout if downloader is not in quiet mode."""
424                 if self._downloader is None or not self._downloader.params.get('quiet', False):
425                         print message
426         
427         def to_stderr(self, message):
428                 """Print message to stderr."""
429                 print >>sys.stderr, message
430
431         def _real_initialize(self):
432                 """Real initialization process. Redefine in subclasses."""
433                 pass
434
435         def _real_extract(self, url):
436                 """Real extraction process. Redefine in subclasses."""
437                 pass
438
439 class YoutubeIE(InfoExtractor):
440         """Information extractor for youtube.com."""
441
442         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
443         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
444         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
445         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
446         _NETRC_MACHINE = 'youtube'
447
448         @staticmethod
449         def suitable(url):
450                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
451
452         @staticmethod
453         def htmlentity_transform(matchobj):
454                 """Transforms an HTML entity to a Unicode character."""
455                 entity = matchobj.group(1)
456
457                 # Known non-numeric HTML entity
458                 if entity in htmlentitydefs.name2codepoint:
459                         return unichr(htmlentitydefs.name2codepoint[entity])
460
461                 # Unicode character
462                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
463                 if mobj is not None:
464                         numstr = mobj.group(1)
465                         if numstr.startswith(u'x'):
466                                 base = 16
467                                 numstr = u'0%s' % numstr
468                         else:
469                                 base = 10
470                         return unichr(long(numstr, base))
471
472                 # Unknown entity in name, return its literal representation
473                 return (u'&%s;' % entity)
474
475         def report_lang(self):
476                 """Report attempt to set language."""
477                 self.to_stdout(u'[youtube] Setting language')
478
479         def report_login(self):
480                 """Report attempt to log in."""
481                 self.to_stdout(u'[youtube] Logging in')
482         
483         def report_age_confirmation(self):
484                 """Report attempt to confirm age."""
485                 self.to_stdout(u'[youtube] Confirming age')
486         
487         def report_webpage_download(self, video_id):
488                 """Report attempt to download webpage."""
489                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
490         
491         def report_information_extraction(self, video_id):
492                 """Report attempt to extract video information."""
493                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
494         
495         def report_video_url(self, video_id, video_real_url):
496                 """Report extracted video URL."""
497                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
498         
499         def _real_initialize(self):
500                 if self._downloader is None:
501                         return
502
503                 username = None
504                 password = None
505                 downloader_params = self._downloader.params
506
507                 # Attempt to use provided username and password or .netrc data
508                 if downloader_params.get('username', None) is not None:
509                         username = downloader_params['username']
510                         password = downloader_params['password']
511                 elif downloader_params.get('usenetrc', False):
512                         try:
513                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
514                                 if info is not None:
515                                         username = info[0]
516                                         password = info[2]
517                                 else:
518                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
519                         except (IOError, netrc.NetrcParseError), err:
520                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
521                                 return
522
523                 # Set language
524                 request = urllib2.Request(self._LANG_URL, None, std_headers)
525                 try:
526                         self.report_lang()
527                         urllib2.urlopen(request).read()
528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
529                         self.to_stderr(u'WARNING: unable to set language: %s' % str(err))
530                         return
531
532                 # No authentication to be performed
533                 if username is None:
534                         return
535
536                 # Log in
537                 login_form = {
538                                 'current_form': 'loginForm',
539                                 'next':         '/',
540                                 'action_login': 'Log In',
541                                 'username':     username,
542                                 'password':     password,
543                                 }
544                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
545                 try:
546                         self.report_login()
547                         login_results = urllib2.urlopen(request).read()
548                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
549                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
550                                 return
551                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
552                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
553                         return
554         
555                 # Confirm age
556                 age_form = {
557                                 'next_url':             '/',
558                                 'action_confirm':       'Confirm',
559                                 }
560                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
561                 try:
562                         self.report_age_confirmation()
563                         age_results = urllib2.urlopen(request).read()
564                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
565                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
566                         return
567
568         def _real_extract(self, url):
569                 # Extract video id from URL
570                 mobj = re.match(self._VALID_URL, url)
571                 if mobj is None:
572                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
573                         return [None]
574                 video_id = mobj.group(2)
575
576                 # Downloader parameters
577                 format_param = None
578                 if self._downloader is not None:
579                         params = self._downloader.params
580                         format_param = params.get('format', None)
581
582                 # Extension
583                 video_extension = {
584                         '17': '3gp',
585                         '18': 'mp4',
586                         '22': 'mp4',
587                 }.get(format_param, 'flv')
588
589                 # Normalize URL, including format
590                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
591                 if format_param is not None:
592                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
593                 request = urllib2.Request(normalized_url, None, std_headers)
594                 try:
595                         self.report_webpage_download(video_id)
596                         video_webpage = urllib2.urlopen(request).read()
597                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
598                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
599                         return [None]
600                 self.report_information_extraction(video_id)
601                 
602                 # "t" param
603                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
604                 if mobj is None:
605                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
606                         return [None]
607                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
608                 if format_param is not None:
609                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
610                 self.report_video_url(video_id, video_real_url)
611
612                 # uploader
613                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
614                 if mobj is None:
615                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
616                         return [None]
617                 video_uploader = mobj.group(1)
618
619                 # title
620                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
621                 if mobj is None:
622                         self.to_stderr(u'ERROR: unable to extract video title')
623                         return [None]
624                 video_title = mobj.group(1).decode('utf-8')
625                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
626                 video_title = video_title.replace(os.sep, u'%')
627
628                 # simplified title
629                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
630                 simple_title = simple_title.strip(ur'_')
631
632                 # Return information
633                 return [{
634                         'id':           video_id.decode('utf-8'),
635                         'url':          video_real_url.decode('utf-8'),
636                         'uploader':     video_uploader.decode('utf-8'),
637                         'title':        video_title,
638                         'stitle':       simple_title,
639                         'ext':          video_extension.decode('utf-8'),
640                         }]
641
642 class MetacafeIE(InfoExtractor):
643         """Information Extractor for metacafe.com."""
644
645         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
646         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
647         _youtube_ie = None
648
649         def __init__(self, youtube_ie, downloader=None):
650                 InfoExtractor.__init__(self, downloader)
651                 self._youtube_ie = youtube_ie
652
653         @staticmethod
654         def suitable(url):
655                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
656
657         def report_disclaimer(self):
658                 """Report disclaimer retrieval."""
659                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
660
661         def report_age_confirmation(self):
662                 """Report attempt to confirm age."""
663                 self.to_stdout(u'[metacafe] Confirming age')
664         
665         def report_download_webpage(self, video_id):
666                 """Report webpage download."""
667                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
668         
669         def report_extraction(self, video_id):
670                 """Report information extraction."""
671                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
672
673         def _real_initialize(self):
674                 # Retrieve disclaimer
675                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
676                 try:
677                         self.report_disclaimer()
678                         disclaimer = urllib2.urlopen(request).read()
679                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
680                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
681                         return
682
683                 # Confirm age
684                 disclaimer_form = {
685                         'filters': '0',
686                         'submit': "Continue - I'm over 18",
687                         }
688                 request = urllib2.Request('http://www.metacafe.com/', urllib.urlencode(disclaimer_form), std_headers)
689                 try:
690                         self.report_age_confirmation()
691                         disclaimer = urllib2.urlopen(request).read()
692                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
693                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
694                         return
695         
696         def _real_extract(self, url):
697                 # Extract id and simplified title from URL
698                 mobj = re.match(self._VALID_URL, url)
699                 if mobj is None:
700                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
701                         return [None]
702
703                 video_id = mobj.group(1)
704
705                 # Check if video comes from YouTube
706                 mobj2 = re.match(r'^yt-(.*)$', video_id)
707                 if mobj2 is not None:
708                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
709
710                 simple_title = mobj.group(2).decode('utf-8')
711                 video_extension = 'flv'
712
713                 # Retrieve video webpage to extract further information
714                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
715                 try:
716                         self.report_download_webpage(video_id)
717                         webpage = urllib2.urlopen(request).read()
718                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
719                         self.to_stderr(u'ERROR: unable retrieve video webpage: %s' % str(err))
720                         return [None]
721
722                 # Extract URL, uploader and title from webpage
723                 self.report_extraction(video_id)
724                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
725                 if mobj is None:
726                         self.to_stderr(u'ERROR: unable to extract media URL')
727                         return [None]
728                 mediaURL = mobj.group(1).replace('\\', '')
729
730                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
731                 if mobj is None:
732                         self.to_stderr(u'ERROR: unable to extract gdaKey')
733                         return [None]
734                 gdaKey = mobj.group(1)
735
736                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
737
738                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
739                 if mobj is None:
740                         self.to_stderr(u'ERROR: unable to extract title')
741                         return [None]
742                 video_title = mobj.group(1).decode('utf-8')
743
744                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
745                 if mobj is None:
746                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
747                         return [None]
748                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
749
750                 # Return information
751                 return [{
752                         'id':           video_id.decode('utf-8'),
753                         'url':          video_url.decode('utf-8'),
754                         'uploader':     video_uploader.decode('utf-8'),
755                         'title':        video_title,
756                         'stitle':       simple_title,
757                         'ext':          video_extension.decode('utf-8'),
758                         }]
759
760
761 class YoutubeSearchIE(InfoExtractor):
762         """Information Extractor for YouTube search queries."""
763         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
764         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
765         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
766         _MORE_PAGES_INDICATOR = r'>Next</a>'
767         _youtube_ie = None
768         _max_youtube_results = 1000
769
770         def __init__(self, youtube_ie, downloader=None):
771                 InfoExtractor.__init__(self, downloader)
772                 self._youtube_ie = youtube_ie
773         
774         @staticmethod
775         def suitable(url):
776                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
777
778         def report_download_page(self, query, pagenum):
779                 """Report attempt to download playlist page with given number."""
780                 self.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
781
782         def _real_initialize(self):
783                 self._youtube_ie.initialize()
784         
785         def _real_extract(self, query):
786                 mobj = re.match(self._VALID_QUERY, query)
787                 if mobj is None:
788                         self.to_stderr(u'ERROR: invalid search query "%s"' % query)
789                         return [None]
790
791                 prefix, query = query.split(':')
792                 prefix = prefix[8:]
793                 if prefix == '':
794                         return self._download_n_results(query, 1)
795                 elif prefix == 'all':
796                         return self._download_n_results(query, self._max_youtube_results)
797                 else:
798                         try:
799                                 n = int(prefix)
800                                 if n <= 0:
801                                         self.to_stderr(u'ERROR: invalid download number %s for query "%s"' % (n, query))
802                                         return [None]
803                                 elif n > self._max_youtube_results:
804                                         self.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
805                                         n = self._max_youtube_results
806                                 return self._download_n_results(query, n)
807                         except ValueError: # parsing prefix as int fails
808                                 return self._download_n_results(query, 1)
809
810         def _download_n_results(self, query, n):
811                 """Downloads a specified number of results for a query"""
812
813                 video_ids = []
814                 already_seen = set()
815                 pagenum = 1
816
817                 while True:
818                         self.report_download_page(query, pagenum)
819                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
820                         request = urllib2.Request(result_url, None, std_headers)
821                         try:
822                                 page = urllib2.urlopen(request).read()
823                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
824                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
825                                 return [None]
826
827                         # Extract video identifiers
828                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
829                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
830                                 if video_id not in already_seen:
831                                         video_ids.append(video_id)
832                                         already_seen.add(video_id)
833                                         if len(video_ids) == n:
834                                                 # Specified n videos reached
835                                                 information = []
836                                                 for id in video_ids:
837                                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
838                                                 return information
839
840                         if self._MORE_PAGES_INDICATOR not in page:
841                                 information = []
842                                 for id in video_ids:
843                                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
844                                 return information
845
846                         pagenum = pagenum + 1
847
848 class YoutubePlaylistIE(InfoExtractor):
849         """Information Extractor for YouTube playlists."""
850
851         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
852         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
853         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
854         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
855         _youtube_ie = None
856
857         def __init__(self, youtube_ie, downloader=None):
858                 InfoExtractor.__init__(self, downloader)
859                 self._youtube_ie = youtube_ie
860         
861         @staticmethod
862         def suitable(url):
863                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
864
865         def report_download_page(self, playlist_id, pagenum):
866                 """Report attempt to download playlist page with given number."""
867                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
868
869         def _real_initialize(self):
870                 self._youtube_ie.initialize()
871         
872         def _real_extract(self, url):
873                 # Extract playlist id
874                 mobj = re.match(self._VALID_URL, url)
875                 if mobj is None:
876                         self.to_stderr(u'ERROR: invalid url: %s' % url)
877                         return [None]
878
879                 # Download playlist pages
880                 playlist_id = mobj.group(1)
881                 video_ids = []
882                 pagenum = 1
883
884                 while True:
885                         self.report_download_page(playlist_id, pagenum)
886                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
887                         try:
888                                 page = urllib2.urlopen(request).read()
889                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
890                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
891                                 return [None]
892
893                         # Extract video identifiers
894                         ids_in_page = []
895                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
896                                 if mobj.group(1) not in ids_in_page:
897                                         ids_in_page.append(mobj.group(1))
898                         video_ids.extend(ids_in_page)
899
900                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
901                                 break
902                         pagenum = pagenum + 1
903
904                 information = []
905                 for id in video_ids:
906                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
907                 return information
908
909 class PostProcessor(object):
910         """Post Processor class.
911
912         PostProcessor objects can be added to downloaders with their
913         add_post_processor() method. When the downloader has finished a
914         successful download, it will take its internal chain of PostProcessors
915         and start calling the run() method on each one of them, first with
916         an initial argument and then with the returned value of the previous
917         PostProcessor.
918
919         The chain will be stopped if one of them ever returns None or the end
920         of the chain is reached.
921
922         PostProcessor objects follow a "mutual registration" process similar
923         to InfoExtractor objects.
924         """
925
926         _downloader = None
927
928         def __init__(self, downloader=None):
929                 self._downloader = downloader
930
931         def to_stdout(self, message):
932                 """Print message to stdout if downloader is not in quiet mode."""
933                 if self._downloader is None or not self._downloader.params.get('quiet', False):
934                         print message
935         
936         def to_stderr(self, message):
937                 """Print message to stderr."""
938                 print >>sys.stderr, message
939
940         def set_downloader(self, downloader):
941                 """Sets the downloader for this PP."""
942                 self._downloader = downloader
943         
944         def run(self, information):
945                 """Run the PostProcessor.
946
947                 The "information" argument is a dictionary like the ones
948                 returned by InfoExtractors. The only difference is that this
949                 one has an extra field called "filepath" that points to the
950                 downloaded file.
951
952                 When this method returns None, the postprocessing chain is
953                 stopped. However, this method may return an information
954                 dictionary that will be passed to the next postprocessing
955                 object in the chain. It can be the one it received after
956                 changing some fields.
957
958                 In addition, this method may raise a PostProcessingError
959                 exception that will be taken into account by the downloader
960                 it was called from.
961                 """
962                 return information # by default, do nothing
963         
964 ### MAIN PROGRAM ###
if __name__ == '__main__':
        # Command line entry point: parse the options, build the extractors
        # and the downloader, download every URL and exit with the
        # downloader's return code.
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                # One opener with both handlers: installing a second opener
                # would simply replace the first one
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                                usage='Usage: %prog [options] url...',
                                version='INTERNAL',
                                conflict_handler='resolve',
                                )
                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                parser.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                parser.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                parser.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                parser.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                parser.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                parser.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                parser.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                parser.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option('-f', '--format',
                                dest='format', metavar='FMT', help='video format code')
                parser.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                parser.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
                parser.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                parser.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = [line.strip() for line in batchfd]
                                finally:
                                        # Do not leave the file open for the rest of the run
                                        batchfd.close()
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                        # Blank lines are not URLs
                        batchurls = [batchurl for batchurl in batchurls if batchurl]
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        sys.exit(u'ERROR: you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        sys.exit(u'ERROR: account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        sys.exit(u'ERROR: using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                sys.exit(u'ERROR: invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                charset = locale.getdefaultlocale()[1]
                if charset is None:
                        charset = 'ascii'

                # Output template: the one given by the user if any (an empty
                # one counts as not given), else one based on the title
                # options, else the plain video id
                outtmpl = None
                if opts.outtmpl is not None:
                        outtmpl = opts.outtmpl.decode(charset)
                if not outtmpl:
                        if opts.usetitle:
                                outtmpl = u'%(stitle)s-%(id)s.%(ext)s'
                        elif opts.useliteral:
                                outtmpl = u'%(title)s-%(id)s.%(ext)s'
                        else:
                                outtmpl = u'%(id)s.%(ext)s'

                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': outtmpl,
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')