Add format 35 (flv) as second best in quality
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
30 class DownloadError(Exception):
31         """Download Error exception.
32         
33         This exception may be thrown by FileDownloader objects if they are not
34         configured to continue on errors. The exception will contain the
35         appropriate error message.
36         """
37         pass
38
39 class SameFileError(Exception):
40         """Same File exception.
41
42         This exception will be thrown by FileDownloader objects if they detect
43         multiple files would have to be downloaded to the same file on disk.
44         """
45         pass
46
47 class PostProcessingError(Exception):
48         """Post Processing exception.
49
50         This exception may be raised by PostProcessor's .run() method to
51         indicate an error in the postprocessing task.
52         """
53         pass
54
55 class UnavailableFormatError(Exception):
56         """Unavailable Format exception.
57
58         This exception will be thrown when a video is requested
59         in a format that is not available for that video.
60         """
61
62 class FileDownloader(object):
63         """File Downloader class.
64
65         File downloader objects are responsible for downloading the actual
66         video file and writing it to disk if the user has requested it,
67         among other tasks. In most cases there should be one per program.
68         Given a video URL, the downloader does not know how to extract all
69         the needed information (that is the task of InfoExtractors), so it
70         has to pass the URL to one of them.
71
72         For this, file downloader objects have a method that allows
73         InfoExtractors to be registered in a given order. When it is passed
74         a URL, the file downloader hands it to the first InfoExtractor it
75         finds that reports being able to handle it. The InfoExtractor extracts
76         all the information about the video or videos the URL refers to, and
77         asks the FileDownloader to process the video information, possibly
78         downloading the video.
79
80         File downloaders accept a lot of parameters. In order not to saturate
81         the object constructor with arguments, it receives a dictionary of
82         options instead. These options are available through the params
83         attribute for the InfoExtractors to use. The FileDownloader also
84         registers itself as the downloader in charge of the InfoExtractors
85         that are added to it, so this is a "mutual registration".
86
87         Available options:
88
89         username:       Username for authentication purposes.
90         password:       Password for authentication purposes.
91         usenetrc:       Use netrc for authentication instead.
92         quiet:          Do not print messages to stdout.
93         forceurl:       Force printing final URL.
94         forcetitle:     Force printing title.
95         simulate:       Do not download the video files.
96         format:         Video format code.
97         outtmpl:        Template for output names.
98         ignoreerrors:   Do not stop on download errors.
99         ratelimit:      Download speed limit, in bytes/sec.
100         nooverwrites:   Prevent overwriting files.
101         """
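        # A minimal usage sketch (illustrative only, never executed by this script):
        # build a FileDownloader from an options dictionary, register InfoExtractors
        # in priority order, and let download() hand each URL to the first suitable
        # extractor.
        #
        #   fd = FileDownloader({'outtmpl': u'%(id)s.%(ext)s', 'nooverwrites': False})
        #   fd.add_info_extractor(YoutubeIE())
        #   retcode = fd.download(['http://www.youtube.com/watch?v=...'])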
102
103         params = None
104         _ies = []
105         _pps = []
106         _download_retcode = None
107
108         def __init__(self, params):
109                 """Create a FileDownloader object with the given options."""
110                 self._ies = []
111                 self._pps = []
112                 self._download_retcode = 0
113                 self.params = params
114         
115         @staticmethod
116         def pmkdir(filename):
117                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
118                 components = filename.split(os.sep)
119                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
120                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
121                 for dir in aggregate:
122                         if not os.path.exists(dir):
123                                 os.mkdir(dir)
124         
125         @staticmethod
126         def format_bytes(bytes):
127                 if bytes is None:
128                         return 'N/A'
129                 if bytes == 0:
130                         exponent = 0
131                 else:
132                         exponent = long(math.log(float(bytes), 1024.0))
133                 suffix = 'bkMGTPEZY'[exponent]
134                 converted = float(bytes) / float(1024**exponent)
135                 return '%.2f%s' % (converted, suffix)
136
137         @staticmethod
138         def calc_percent(byte_counter, data_len):
139                 if data_len is None:
140                         return '---.-%'
141                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
142
143         @staticmethod
144         def calc_eta(start, now, total, current):
145                 if total is None:
146                         return '--:--'
147                 dif = now - start
148                 if current == 0 or dif < 0.001: # One millisecond
149                         return '--:--'
150                 rate = float(current) / dif
151                 eta = long((float(total) - float(current)) / rate)
152                 (eta_mins, eta_secs) = divmod(eta, 60)
153                 if eta_mins > 99:
154                         return '--:--'
155                 return '%02d:%02d' % (eta_mins, eta_secs)
156
157         @staticmethod
158         def calc_speed(start, now, bytes):
159                 dif = now - start
160                 if bytes == 0 or dif < 0.001: # One millisecond
161                         return '%10s' % '---b/s'
162                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
163
164         @staticmethod
165         def best_block_size(elapsed_time, bytes):
166                 new_min = max(bytes / 2.0, 1.0)
167                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
168                 if elapsed_time < 0.001:
169                         return int(new_max)
170                 rate = bytes / elapsed_time
171                 if rate > new_max:
172                         return int(new_max)
173                 if rate < new_min:
174                         return int(new_min)
175                 return int(rate)
176
177         @staticmethod
178         def parse_bytes(bytestr):
179                 """Parse a string indicating a byte quantity into a long integer."""
180                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
181                 if matchobj is None:
182                         return None
183                 number = float(matchobj.group(1))
184                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
185                 return long(round(number * multiplier))
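        # For example, parse_bytes('50k') yields 51200 and parse_bytes('44.6m')
        # yields 46766490 (long integers, rounded to the nearest byte).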
186
187         def add_info_extractor(self, ie):
188                 """Add an InfoExtractor object to the end of the list."""
189                 self._ies.append(ie)
190                 ie.set_downloader(self)
191         
192         def add_post_processor(self, pp):
193                 """Add a PostProcessor object to the end of the chain."""
194                 self._pps.append(pp)
195                 pp.set_downloader(self)
196         
197         def to_stdout(self, message, skip_eol=False):
198                 """Print message to stdout if not in quiet mode."""
199                 if not self.params.get('quiet', False):
200                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
201                         sys.stdout.flush()
202         
203         def to_stderr(self, message):
204                 """Print message to stderr."""
205                 print >>sys.stderr, message
206         
207         def fixed_template(self):
208                 """Checks if the output template is fixed."""
209                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
210
211         def trouble(self, message=None):
212                 """Determine action to take when a download problem appears.
213
214                 After printing the message, this method either raises an exception
215                 or merely records the error, depending on whether the downloader
216                 has been configured to ignore download errors.
217                 """
218                 if message is not None:
219                         self.to_stderr(message)
220                 if not self.params.get('ignoreerrors', False):
221                         raise DownloadError(message)
222                 self._download_retcode = 1
223
224         def slow_down(self, start_time, byte_counter):
225                 """Sleep if the download speed is over the rate limit."""
226                 rate_limit = self.params.get('ratelimit', None)
227                 if rate_limit is None or byte_counter == 0:
228                         return
229                 now = time.time()
230                 elapsed = now - start_time
231                 if elapsed <= 0.0:
232                         return
233                 speed = float(byte_counter) / elapsed
234                 if speed > rate_limit:
235                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
236
237         def report_destination(self, filename):
238                 """Report destination filename."""
239                 self.to_stdout(u'[download] Destination: %s' % filename)
240         
241         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
242                 """Report download progress."""
243                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
244                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
245         
246         def report_finish(self):
247                 """Report download finished."""
248                 self.to_stdout(u'')
249
250         def process_info(self, info_dict):
251                 """Process a single dictionary returned by an InfoExtractor."""
252                 # Forced printings
253                 if self.params.get('forcetitle', False):
254                         print info_dict['title'].encode(locale.getpreferredencoding())
255                 if self.params.get('forceurl', False):
256                         print info_dict['url'].encode(locale.getpreferredencoding())
257                         
258                 # Do nothing else if in simulate mode
259                 if self.params.get('simulate', False):
260                         return
261
262                 try:
263                         filename = self.params['outtmpl'] % info_dict
264                         self.report_destination(filename)
265                 except (ValueError, KeyError), err:
266                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                            return
267                 if self.params['nooverwrites'] and os.path.exists(filename):
268                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
269                         return
270
271                 try:
272                         self.pmkdir(filename)
273                 except (OSError, IOError), err:
274                         self.trouble('ERROR: unable to create directories: %s' % str(err))
275                         return
276
277                 try:
278                         outstream = open(filename, 'wb')
279                 except (OSError, IOError), err:
280                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
281                         return
282
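                # Note: urllib2's HTTP errors derive from IOError, so a failed
                # video-data download typically lands in the handler below; the
                # partial file is removed and UnavailableFormatError lets the
                # calling InfoExtractor try another format.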
283                 try:
284                         self._do_download(outstream, info_dict['url'])
285                         outstream.close()
286                 except (OSError, IOError), err:
287                         os.remove(filename)
288                         raise UnavailableFormatError
289                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
290                         self.trouble('ERROR: unable to download video data: %s' % str(err))
291                         return
292
293                 try:
294                         self.post_process(filename, info_dict)
295                 except (PostProcessingError), err:
296                         self.trouble('ERROR: postprocessing: %s' % str(err))
297                         return
298
299         def download(self, url_list):
300                 """Download a given list of URLs."""
301                 if len(url_list) > 1 and self.fixed_template():
302                         raise SameFileError(self.params['outtmpl'])
303
304                 for url in url_list:
305                         suitable_found = False
306                         for ie in self._ies:
307                                 # Go to next InfoExtractor if not suitable
308                                 if not ie.suitable(url):
309                                         continue
310
311                                 # Suitable InfoExtractor found
312                                 suitable_found = True
313
314                                 # Extract information from URL and process it
315                                 ie.extract(url)
316
317                                 # Suitable InfoExtractor had been found; go to next URL
318                                 break
319
320                         if not suitable_found:
321                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
322
323                 return self._download_retcode
324
325         def post_process(self, filename, ie_info):
326                 """Run the postprocessing chain on the given file."""
327                 info = dict(ie_info)
328                 info['filepath'] = filename
329                 for pp in self._pps:
330                         info = pp.run(info)
331                         if info is None:
332                                 break
333         
334         def _do_download(self, stream, url):
335                 request = urllib2.Request(url, None, std_headers)
336                 data = urllib2.urlopen(request)
337                 data_len = data.info().get('Content-length', None)
338                 data_len_str = self.format_bytes(data_len)
339                 byte_counter = 0
340                 block_size = 1024
341                 start = time.time()
342                 while True:
343                         # Progress message
344                         percent_str = self.calc_percent(byte_counter, data_len)
345                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
346                         speed_str = self.calc_speed(start, time.time(), byte_counter)
347                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
348
349                         # Download and write
350                         before = time.time()
351                         data_block = data.read(block_size)
352                         after = time.time()
353                         data_block_len = len(data_block)
354                         if data_block_len == 0:
355                                 break
356                         byte_counter += data_block_len
357                         stream.write(data_block)
358                         block_size = self.best_block_size(after - before, data_block_len)
359
360                         # Apply rate limit
361                         self.slow_down(start, byte_counter)
362
363                 self.report_finish()
364                 if data_len is not None and str(byte_counter) != data_len:
365                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
366
367 class InfoExtractor(object):
368         """Information Extractor class.
369
370         Information extractors are the classes that, given a URL, extract
371         information from the video (or videos) the URL refers to. This
372         information includes the real video URL, the video title, the simplified
373         title, the uploader nickname and others. The information is stored in a
374         dictionary which is then passed to the FileDownloader. The FileDownloader
375         processes this information, possibly downloading the video to the file
376         system, among other possible outcomes. The dictionaries must include
377         the following fields:
378
379         id:             Video identifier.
380         url:            Final video URL.
381         uploader:       Nickname of the video uploader.
382         title:          Literal title.
383         stitle:         Simplified title.
384         ext:            Video filename extension.
385
386         Subclasses of this one should re-define the _real_initialize() and
387         _real_extract() methods, as well as the suitable() static method.
388         Probably, they should also be instantiated and added to the main
389         downloader.
390         """
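        # A minimal subclass sketch (hypothetical, for illustration only):
        #
        #   class ExampleIE(InfoExtractor):
        #           @staticmethod
        #           def suitable(url):
        #                   return url.startswith('http://video.example.com/')
        #           def _real_initialize(self):
        #                   pass
        #           def _real_extract(self, url):
        #                   self._downloader.process_info({
        #                           'id':       u'example',
        #                           'url':      url.decode('utf-8'),
        #                           'uploader': u'unknown',
        #                           'title':    u'Example video',
        #                           'stitle':   u'Example_video',
        #                           'ext':      u'flv',
        #                   })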
391
392         _ready = False
393         _downloader = None
394
395         def __init__(self, downloader=None):
396                 """Constructor. Receives an optional downloader."""
397                 self._ready = False
398                 self.set_downloader(downloader)
399
400         @staticmethod
401         def suitable(url):
402                 """Receives a URL and returns True if suitable for this IE."""
403                 return False
404
405         def initialize(self):
406                 """Initializes an instance (authentication, etc)."""
407                 if not self._ready:
408                         self._real_initialize()
409                         self._ready = True
410
411         def extract(self, url):
412                 """Extracts URL information and returns it as a list of dicts."""
413                 self.initialize()
414                 return self._real_extract(url)
415
416         def set_downloader(self, downloader):
417                 """Sets the downloader for this IE."""
418                 self._downloader = downloader
419         
420         def _real_initialize(self):
421                 """Real initialization process. Redefine in subclasses."""
422                 pass
423
424         def _real_extract(self, url):
425                 """Real extraction process. Redefine in subclasses."""
426                 pass
427
428 class YoutubeIE(InfoExtractor):
429         """Information extractor for youtube.com."""
430
431         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
432         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
433         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
434         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
435         _NETRC_MACHINE = 'youtube'
436         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
437         _video_extensions = {
438                 '13': '3gp',
439                 '17': 'mp4',
440                 '18': 'mp4',
441                 '22': 'mp4',
442         }
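        # Formats missing from this table (e.g. 35, the second-best choice for -b)
        # fall back to the 'flv' default used in _real_extract().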
443
444         @staticmethod
445         def suitable(url):
446                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
447
448         @staticmethod
449         def htmlentity_transform(matchobj):
450                 """Transforms an HTML entity to a Unicode character."""
451                 entity = matchobj.group(1)
452
453                 # Known non-numeric HTML entity
454                 if entity in htmlentitydefs.name2codepoint:
455                         return unichr(htmlentitydefs.name2codepoint[entity])
456
457                 # Unicode character
458                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
459                 if mobj is not None:
460                         numstr = mobj.group(1)
461                         if numstr.startswith(u'x'):
462                                 base = 16
463                                 numstr = u'0%s' % numstr
464                         else:
465                                 base = 10
466                         return unichr(long(numstr, base))
467
468                 # Unknown entity in name, return its literal representation
469                 return (u'&%s;' % entity)
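        # For example, 'amp' maps to u'&' and both '#39' and '#x27' map to u"'".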
470
471         def report_lang(self):
472                 """Report attempt to set language."""
473                 self._downloader.to_stdout(u'[youtube] Setting language')
474
475         def report_login(self):
476                 """Report attempt to log in."""
477                 self._downloader.to_stdout(u'[youtube] Logging in')
478         
479         def report_age_confirmation(self):
480                 """Report attempt to confirm age."""
481                 self._downloader.to_stdout(u'[youtube] Confirming age')
482         
483         def report_webpage_download(self, video_id):
484                 """Report attempt to download webpage."""
485                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
486         
487         def report_information_extraction(self, video_id):
488                 """Report attempt to extract video information."""
489                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
490         
491         def report_video_url(self, video_id, video_real_url):
492                 """Report extracted video URL."""
493                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
494         
495         def report_unavailable_format(self, video_id, format):
496                 """Report that a format is unavailable."""
497                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
498         
499         def _real_initialize(self):
500                 if self._downloader is None:
501                         return
502
503                 username = None
504                 password = None
505                 downloader_params = self._downloader.params
506
507                 # Attempt to use provided username and password or .netrc data
508                 if downloader_params.get('username', None) is not None:
509                         username = downloader_params['username']
510                         password = downloader_params['password']
511                 elif downloader_params.get('usenetrc', False):
512                         try:
513                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
514                                 if info is not None:
515                                         username = info[0]
516                                         password = info[2]
517                                 else:
518                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
519                         except (IOError, netrc.NetrcParseError), err:
520                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
521                                 return
522
523                 # Set language
524                 request = urllib2.Request(self._LANG_URL, None, std_headers)
525                 try:
526                         self.report_lang()
527                         urllib2.urlopen(request).read()
528                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
529                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
530                         return
531
532                 # No authentication to be performed
533                 if username is None:
534                         return
535
536                 # Log in
537                 login_form = {
538                                 'current_form': 'loginForm',
539                                 'next':         '/',
540                                 'action_login': 'Log In',
541                                 'username':     username,
542                                 'password':     password,
543                                 }
544                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
545                 try:
546                         self.report_login()
547                         login_results = urllib2.urlopen(request).read()
548                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
549                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
550                                 return
551                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
552                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
553                         return
554         
555                 # Confirm age
556                 age_form = {
557                                 'next_url':             '/',
558                                 'action_confirm':       'Confirm',
559                                 }
560                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
561                 try:
562                         self.report_age_confirmation()
563                         age_results = urllib2.urlopen(request).read()
564                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
565                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
566                         return
567
568         def _real_extract(self, url):
569                 # Extract video id from URL
570                 mobj = re.match(self._VALID_URL, url)
571                 if mobj is None:
572                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
573                         return
574                 video_id = mobj.group(2)
575
576                 # Downloader parameters
577                 best_quality = False
578                 format_param = None
579                 quality_index = 0
580                 if self._downloader is not None:
581                         params = self._downloader.params
582                         format_param = params.get('format', None)
583                         if format_param == '0':
584                                 format_param = self._available_formats[quality_index]
585                                 best_quality = True
586
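                # With -b (best quality) the loop below walks _available_formats in
                # priority order (22, 35, 18, 17, 13), moving on to the next format
                # whenever UnavailableFormatError is raised.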
587                 while True:
588                         try:
589                                 # Extension
590                                 video_extension = self._video_extensions.get(format_param, 'flv')
591
592                                 # Normalize URL, including format
593                                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
594                                 if format_param is not None:
595                                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
596                                 request = urllib2.Request(normalized_url, None, std_headers)
597                                 try:
598                                         self.report_webpage_download(video_id)
599                                         video_webpage = urllib2.urlopen(request).read()
600                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
601                                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
602                                         return
603                                 self.report_information_extraction(video_id)
604                                 
605                                 # "t" param
606                                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
607                                 if mobj is None:
608                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
609                                         return
610                                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
611                                 if format_param is not None:
612                                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
613                                 self.report_video_url(video_id, video_real_url)
614
615                                 # uploader
616                                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
617                                 if mobj is None:
618                                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
619                                         return
620                                 video_uploader = mobj.group(1)
621
622                                 # title
623                                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
624                                 if mobj is None:
625                                         self._downloader.trouble(u'ERROR: unable to extract video title')
626                                         return
627                                 video_title = mobj.group(1).decode('utf-8')
628                                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
629                                 video_title = video_title.replace(os.sep, u'%')
630
631                                 # simplified title
632                                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
633                                 simple_title = simple_title.strip(ur'_')
634
635                                 # Process video information
636                                 self._downloader.process_info({
637                                         'id':           video_id.decode('utf-8'),
638                                         'url':          video_real_url.decode('utf-8'),
639                                         'uploader':     video_uploader.decode('utf-8'),
640                                         'title':        video_title,
641                                         'stitle':       simple_title,
642                                         'ext':          video_extension.decode('utf-8'),
643                                 })
644
645                                 return
646
647                         except UnavailableFormatError, err:
648                                 if best_quality:
649                                         if quality_index == len(self._available_formats) - 1:
650                                                 # I don't ever expect this to happen
651                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
652                                                 return
653                                         else:
654                                                 self.report_unavailable_format(video_id, format_param)
655                                                 quality_index += 1
656                                                 format_param = self._available_formats[quality_index]
657                                                 continue
658                                 else: 
659                                         self._downloader.trouble(u'ERROR: format not available for video')
660                                         return
661
662
663 class MetacafeIE(InfoExtractor):
664         """Information Extractor for metacafe.com."""
665
666         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
667         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
668         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
669         _youtube_ie = None
670
671         def __init__(self, youtube_ie, downloader=None):
672                 InfoExtractor.__init__(self, downloader)
673                 self._youtube_ie = youtube_ie
674
675         @staticmethod
676         def suitable(url):
677                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
678
679         def report_disclaimer(self):
680                 """Report disclaimer retrieval."""
681                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
682
683         def report_age_confirmation(self):
684                 """Report attempt to confirm age."""
685                 self._downloader.to_stdout(u'[metacafe] Confirming age')
686         
687         def report_download_webpage(self, video_id):
688                 """Report webpage download."""
689                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
690         
691         def report_extraction(self, video_id):
692                 """Report information extraction."""
693                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
694
695         def _real_initialize(self):
696                 # Retrieve disclaimer
697                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
698                 try:
699                         self.report_disclaimer()
700                         disclaimer = urllib2.urlopen(request).read()
701                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
702                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
703                         return
704
705                 # Confirm age
706                 disclaimer_form = {
707                         'filters': '0',
708                         'submit': "Continue - I'm over 18",
709                         }
710                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
711                 try:
712                         self.report_age_confirmation()
713                         disclaimer = urllib2.urlopen(request).read()
714                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
715                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
716                         return
717         
718         def _real_extract(self, url):
719                 # Extract id and simplified title from URL
720                 mobj = re.match(self._VALID_URL, url)
721                 if mobj is None:
722                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
723                         return
724
725                 video_id = mobj.group(1)
726
727                 # Check if video comes from YouTube
728                 mobj2 = re.match(r'^yt-(.*)$', video_id)
729                 if mobj2 is not None:
730                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
731                         return
732
733                 simple_title = mobj.group(2).decode('utf-8')
734                 video_extension = 'flv'
735
736                 # Retrieve video webpage to extract further information
737                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
738                 try:
739                         self.report_download_webpage(video_id)
740                         webpage = urllib2.urlopen(request).read()
741                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
742                         self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
743                         return
744
745                 # Extract URL, uploader and title from webpage
746                 self.report_extraction(video_id)
747                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
748                 if mobj is None:
749                         self._downloader.trouble(u'ERROR: unable to extract media URL')
750                         return
751                 mediaURL = urllib.unquote(mobj.group(1))
752
753                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
754                 if mobj is None:
755                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
756                         return
757                 gdaKey = mobj.group(1)
758
759                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
760
761                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
762                 if mobj is None:
763                         self._downloader.trouble(u'ERROR: unable to extract title')
764                         return
765                 video_title = mobj.group(1).decode('utf-8')
766
767                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
768                 if mobj is None:
769                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
770                         return
771                 video_uploader = mobj.group(1)
772
773                 try:
774                         # Process video information
775                         self._downloader.process_info({
776                                 'id':           video_id.decode('utf-8'),
777                                 'url':          video_url.decode('utf-8'),
778                                 'uploader':     video_uploader.decode('utf-8'),
779                                 'title':        video_title,
780                                 'stitle':       simple_title,
781                                 'ext':          video_extension.decode('utf-8'),
782                         })
783                 except UnavailableFormatError:
784                         self._downloader.trouble(u'ERROR: format not available for video')
785
786
787 class YoutubeSearchIE(InfoExtractor):
788         """Information Extractor for YouTube search queries."""
789         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
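        # Accepted query forms (see _real_extract): "ytsearch:WORDS" downloads the
        # first result, "ytsearchN:WORDS" the first N results, and
        # "ytsearchall:WORDS" up to _max_youtube_results results.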
790         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
791         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
792         _MORE_PAGES_INDICATOR = r'>Next</a>'
793         _youtube_ie = None
794         _max_youtube_results = 1000
795
796         def __init__(self, youtube_ie, downloader=None):
797                 InfoExtractor.__init__(self, downloader)
798                 self._youtube_ie = youtube_ie
799         
800         @staticmethod
801         def suitable(url):
802                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
803
804         def report_download_page(self, query, pagenum):
805                 """Report attempt to download playlist page with given number."""
806                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
807
808         def _real_initialize(self):
809                 self._youtube_ie.initialize()
810         
811         def _real_extract(self, query):
812                 mobj = re.match(self._VALID_QUERY, query)
813                 if mobj is None:
814                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
815                         return
816
817                 prefix, query = query.split(':')
818                 prefix = prefix[8:]
819                 if prefix == '':
820                         self._download_n_results(query, 1)
821                         return
822                 elif prefix == 'all':
823                         self._download_n_results(query, self._max_youtube_results)
824                         return
825                 else:
826                         try:
827                                 n = int(prefix)
828                                 if n <= 0:
829                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
830                                         return
831                                 elif n > self._max_youtube_results:
832                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
833                                         n = self._max_youtube_results
834                                 self._download_n_results(query, n)
835                                 return
836                         except ValueError: # parsing prefix as int fails
837                                 self._download_n_results(query, 1)
838                                 return
839
840         def _download_n_results(self, query, n):
841                 """Downloads a specified number of results for a query"""
842
843                 video_ids = []
844                 already_seen = set()
845                 pagenum = 1
846
847                 while True:
848                         self.report_download_page(query, pagenum)
849                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
850                         request = urllib2.Request(result_url, None, std_headers)
851                         try:
852                                 page = urllib2.urlopen(request).read()
853                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
854                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
855                                 return
856
857                         # Extract video identifiers
858                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
859                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
860                                 if video_id not in already_seen:
861                                         video_ids.append(video_id)
862                                         already_seen.add(video_id)
863                                         if len(video_ids) == n:
864                                                 # Specified n videos reached
865                                                 for id in video_ids:
866                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
867                                                 return
868
869                         if self._MORE_PAGES_INDICATOR not in page:
870                                 for id in video_ids:
871                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
872                                 return
873
874                         pagenum = pagenum + 1
875
876 class YoutubePlaylistIE(InfoExtractor):
877         """Information Extractor for YouTube playlists."""
878
879         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
880         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
881         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
882         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
883         _youtube_ie = None
884
885         def __init__(self, youtube_ie, downloader=None):
886                 InfoExtractor.__init__(self, downloader)
887                 self._youtube_ie = youtube_ie
888         
889         @staticmethod
890         def suitable(url):
891                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
892
893         def report_download_page(self, playlist_id, pagenum):
894                 """Report attempt to download playlist page with given number."""
895                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
896
897         def _real_initialize(self):
898                 self._youtube_ie.initialize()
899         
900         def _real_extract(self, url):
901                 # Extract playlist id
902                 mobj = re.match(self._VALID_URL, url)
903                 if mobj is None:
904                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
905                         return
906
907                 # Download playlist pages
908                 playlist_id = mobj.group(1)
909                 video_ids = []
910                 pagenum = 1
911
912                 while True:
913                         self.report_download_page(playlist_id, pagenum)
914                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
915                         try:
916                                 page = urllib2.urlopen(request).read()
917                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
918                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
919                                 return
920
921                         # Extract video identifiers
922                         ids_in_page = []
923                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
924                                 if mobj.group(1) not in ids_in_page:
925                                         ids_in_page.append(mobj.group(1))
926                         video_ids.extend(ids_in_page)
927
928                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
929                                 break
930                         pagenum = pagenum + 1
931
932                 for id in video_ids:
933                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
934                 return
935
936 class PostProcessor(object):
937         """Post Processor class.
938
939         PostProcessor objects can be added to downloaders with their
940         add_post_processor() method. When the downloader has finished a
941         successful download, it will take its internal chain of PostProcessors
942         and start calling the run() method on each one of them, first with
943         an initial argument and then with the returned value of the previous
944         PostProcessor.
945
946         The chain will be stopped if one of them ever returns None or the end
947         of the chain is reached.
948
949         PostProcessor objects follow a "mutual registration" process similar
950         to InfoExtractor objects.
951         """
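        # A minimal chain element sketch (hypothetical, for illustration only);
        # "fd" stands for an already configured FileDownloader:
        #
        #   class PrintPathPP(PostProcessor):
        #           def run(self, information):
        #                   self._downloader.to_stdout(u'[postprocess] %s' % information['filepath'])
        #                   return information
        #
        #   fd.add_post_processor(PrintPathPP())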
952
953         _downloader = None
954
955         def __init__(self, downloader=None):
956                 self._downloader = downloader
957
958         def set_downloader(self, downloader):
959                 """Sets the downloader for this PP."""
960                 self._downloader = downloader
961         
962         def run(self, information):
963                 """Run the PostProcessor.
964
965                 The "information" argument is a dictionary like the ones
966                 composed by InfoExtractors. The only difference is that this
967                 one has an extra field called "filepath" that points to the
968                 downloaded file.
969
970                 When this method returns None, the postprocessing chain is
971                 stopped. However, this method may return an information
972                 dictionary that will be passed to the next postprocessing
973                 object in the chain. It can be the one it received after
974                 changing some fields.
975
976                 In addition, this method may raise a PostProcessingError
977                 exception that will be taken into account by the downloader
978                 it was called from.
979                 """
980                 return information # by default, do nothing
981         
982 ### MAIN PROGRAM ###
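# Illustrative invocations, assuming the script is installed as "youtube-dl"
# (VIDEO_ID is a placeholder):
#   youtube-dl -b -t 'http://www.youtube.com/watch?v=VIDEO_ID'
#   youtube-dl -a batch.txt -o '%(stitle)s-%(id)s.%(ext)s'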
983 if __name__ == '__main__':
984         try:
985                 # Modules needed only when running the main program
986                 import getpass
987                 import optparse
988
989                 # General configuration
990                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
991                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
992                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
993
994                 # Parse command line
995                 parser = optparse.OptionParser(
996                         usage='Usage: %prog [options] url...',
997                         version='INTERNAL',
998                         conflict_handler='resolve',
999                 )
1000
1001                 parser.add_option('-h', '--help',
1002                                 action='help', help='print this help text and exit')
1003                 parser.add_option('-v', '--version',
1004                                 action='version', help='print program version and exit')
1005                 parser.add_option('-i', '--ignore-errors',
1006                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1007                 parser.add_option('-r', '--rate-limit',
1008                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1009
1010                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1011                 authentication.add_option('-u', '--username',
1012                                 dest='username', metavar='UN', help='account username')
1013                 authentication.add_option('-p', '--password',
1014                                 dest='password', metavar='PW', help='account password')
1015                 authentication.add_option('-n', '--netrc',
1016                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1017                 parser.add_option_group(authentication)
1018
1019                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1020                 video_format.add_option('-f', '--format',
1021                                 action='append', dest='format', metavar='FMT', help='video format code')
1022                 video_format.add_option('-b', '--best-quality',
1023                                 action='append_const', dest='format', help='download the best quality video possible', const='0')
1024                 video_format.add_option('-m', '--mobile-version',
1025                                 action='append_const', dest='format', help='alias for -f 17', const='17')
1026                 video_format.add_option('-d', '--high-def',
1027                                 action='append_const', dest='format', help='alias for -f 22', const='22')
1028                 parser.add_option_group(video_format)
1029
1030                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1031                 verbosity.add_option('-q', '--quiet',
1032                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1033                 verbosity.add_option('-s', '--simulate',
1034                                 action='store_true', dest='simulate', help='do not download video', default=False)
1035                 verbosity.add_option('-g', '--get-url',
1036                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1037                 verbosity.add_option('-e', '--get-title',
1038                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1039                 parser.add_option_group(verbosity)
1040
1041                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1042                 filesystem.add_option('-t', '--title',
1043                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1044                 filesystem.add_option('-l', '--literal',
1045                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1046                 filesystem.add_option('-o', '--output',
1047                                 dest='outtmpl', metavar='TPL', help='output filename template')
1048                 filesystem.add_option('-a', '--batch-file',
1049                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1050                 filesystem.add_option('-w', '--no-overwrites',
1051                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1052                 parser.add_option_group(filesystem)
1053
1054                 (opts, args) = parser.parse_args()
1055
1056                 # Batch file verification
1057                 batchurls = []
1058                 if opts.batchfile is not None:
1059                         try:
1060                                 batchurls = open(opts.batchfile, 'r').readlines()
1061                                 batchurls = [x.strip() for x in batchurls]
1062                                 batchurls = [x for x in batchurls if len(x) > 0]
1063                         except IOError:
1064                                 sys.exit(u'ERROR: batch file could not be read')
1065                 all_urls = batchurls + args
1066
1067                 # Conflicting, missing and erroneous options
1068                 if len(all_urls) < 1:
1069                         parser.error(u'you must provide at least one URL')
1070                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1071                         parser.error(u'using .netrc conflicts with giving username/password')
1072                 if opts.password is not None and opts.username is None:
1073                         parser.error(u'account username missing')
1074                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1075                         parser.error(u'using output template conflicts with using title or literal title')
1076                 if opts.usetitle and opts.useliteral:
1077                         parser.error(u'using title conflicts with using literal title')
1078                 if opts.username is not None and opts.password is None:
1079                         opts.password = getpass.getpass(u'Type account password and press return:')
1080                 if opts.ratelimit is not None:
1081                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1082                         if numeric_limit is None:
1083                                 parser.error(u'invalid rate limit specified')
1084                         opts.ratelimit = numeric_limit
1085                 if opts.format is not None and len(opts.format) > 1:
1086                         parser.error(u'pass at most one of the video format option flags (-f, -b, -m, -d)')
1087                 if opts.format is None:
1088                         real_format = None
1089                 else:
1090                         real_format = opts.format[0]
1091
1092
1093                 # Information extractors
1094                 youtube_ie = YoutubeIE()
1095                 metacafe_ie = MetacafeIE(youtube_ie)
1096                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1097                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1098
1099                 # File downloader
1100                 fd = FileDownloader({
1101                         'usenetrc': opts.usenetrc,
1102                         'username': opts.username,
1103                         'password': opts.password,
1104                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1105                         'forceurl': opts.geturl,
1106                         'forcetitle': opts.gettitle,
1107                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1108                         'format': real_format,
1109                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1110                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1111                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1112                                 or u'%(id)s.%(ext)s'),
1113                         'ignoreerrors': opts.ignoreerrors,
1114                         'ratelimit': opts.ratelimit,
1115                         'nooverwrites': opts.nooverwrites,
1116                         })
1117                 fd.add_info_extractor(youtube_search_ie)
1118                 fd.add_info_extractor(youtube_pl_ie)
1119                 fd.add_info_extractor(metacafe_ie)
1120                 fd.add_info_extractor(youtube_ie)
1121                 retcode = fd.download(all_urls)
1122                 sys.exit(retcode)
1123
1124         except DownloadError:
1125                 sys.exit(1)
1126         except SameFileError:
1127                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1128         except KeyboardInterrupt:
1129                 sys.exit(u'\nERROR: Interrupted by user')