]> git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl
d3acd9d0a86205776b8bc2bf9cce87ca8c32718e
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
class DownloadError(Exception):
        """Signal that a download problem was found.

        FileDownloader.trouble() raises this exception, carrying the error
        message it was given, unless the downloader has been configured to
        ignore errors.
        """
38
class SameFileError(Exception):
        """Signal that several downloads would end up in the same file.

        FileDownloader.download() raises this exception, carrying the output
        template, when it is given more than one URL while the output
        template is fixed (it has no field to tell the files apart).
        """
46
class PostProcessingError(Exception):
        """Signal a failure in a postprocessing task.

        It is meant to be raised from the .run() method of a PostProcessor.
        FileDownloader.process_info() catches it and reports the message as
        a download problem.
        """
54
class UnavailableFormatError(Exception):
        """Signal that a video is not available in the requested format.

        FileDownloader.process_info() raises it when the video data cannot
        be fetched or written while the best quality was requested, and
        YoutubeIE catches it to retry with the next known format.
        """
        pass
61
62 class FileDownloader(object):
63         """File Downloader class.
64
65         File downloader objects are the ones responsible of downloading the
66         actual video file and writing it to disk if the user has requested
67         it, among some other tasks. In most cases there should be one per
68         program. As, given a video URL, the downloader doesn't know how to
69         extract all the needed information, task that InfoExtractors do, it
70         has to pass the URL to one of them.
71
72         For this, file downloader objects have a method that allows
73         InfoExtractors to be registered in a given order. When it is passed
74         a URL, the file downloader handles it to the first InfoExtractor it
75         finds that reports being able to handle it. The InfoExtractor extracts
76         all the information about the video or videos the URL refers to, and
77         asks the FileDownloader to process the video information, possibly
78         downloading the video.
79
80         File downloaders accept a lot of parameters. In order not to saturate
81         the object constructor with arguments, it receives a dictionary of
82         options instead. These options are available through the params
83         attribute for the InfoExtractors to use. The FileDownloader also
84         registers itself as the downloader in charge for the InfoExtractors
85         that are added to it, so this is a "mutual registration".
86
87         Available options:
88
89         username:       Username for authentication purposes.
90         password:       Password for authentication purposes.
91         usenetrc:       Use netrc for authentication instead.
92         quiet:          Do not print messages to stdout.
93         forceurl:       Force printing final URL.
94         forcetitle:     Force printing title.
95         simulate:       Do not download the video files.
96         format:         Video format code.
97         outtmpl:        Template for output names.
98         ignoreerrors:   Do not stop on download errors.
99         ratelimit:      Download speed limit, in bytes/sec.
100         nooverwrites:   Prevent overwriting files.
101         """
102
103         params = None
104         _ies = []
105         _pps = []
106         _download_retcode = None
107
108         def __init__(self, params):
109                 """Create a FileDownloader object with the given options."""
110                 self._ies = []
111                 self._pps = []
112                 self._download_retcode = 0
113                 self.params = params
114         
115         @staticmethod
116         def pmkdir(filename):
117                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
118                 components = filename.split(os.sep)
119                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
120                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
121                 for dir in aggregate:
122                         if not os.path.exists(dir):
123                                 os.mkdir(dir)
124         
125         @staticmethod
126         def format_bytes(bytes):
127                 if bytes is None:
128                         return 'N/A'
129                 if bytes == 0:
130                         exponent = 0
131                 else:
132                         exponent = long(math.log(float(bytes), 1024.0))
133                 suffix = 'bkMGTPEZY'[exponent]
134                 converted = float(bytes) / float(1024**exponent)
135                 return '%.2f%s' % (converted, suffix)
136
137         @staticmethod
138         def calc_percent(byte_counter, data_len):
139                 if data_len is None:
140                         return '---.-%'
141                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
142
143         @staticmethod
144         def calc_eta(start, now, total, current):
145                 if total is None:
146                         return '--:--'
147                 dif = now - start
148                 if current == 0 or dif < 0.001: # One millisecond
149                         return '--:--'
150                 rate = float(current) / dif
151                 eta = long((float(total) - float(current)) / rate)
152                 (eta_mins, eta_secs) = divmod(eta, 60)
153                 if eta_mins > 99:
154                         return '--:--'
155                 return '%02d:%02d' % (eta_mins, eta_secs)
156
157         @staticmethod
158         def calc_speed(start, now, bytes):
159                 dif = now - start
160                 if bytes == 0 or dif < 0.001: # One millisecond
161                         return '%10s' % '---b/s'
162                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
163
164         @staticmethod
165         def best_block_size(elapsed_time, bytes):
166                 new_min = max(bytes / 2.0, 1.0)
167                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
168                 if elapsed_time < 0.001:
169                         return int(new_max)
170                 rate = bytes / elapsed_time
171                 if rate > new_max:
172                         return int(new_max)
173                 if rate < new_min:
174                         return int(new_min)
175                 return int(rate)
176
177         @staticmethod
178         def parse_bytes(bytestr):
179                 """Parse a string indicating a byte quantity into a long integer."""
180                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
181                 if matchobj is None:
182                         return None
183                 number = float(matchobj.group(1))
184                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
185                 return long(round(number * multiplier))
186
187         def add_info_extractor(self, ie):
188                 """Add an InfoExtractor object to the end of the list."""
189                 self._ies.append(ie)
190                 ie.set_downloader(self)
191         
192         def add_post_processor(self, pp):
193                 """Add a PostProcessor object to the end of the chain."""
194                 self._pps.append(pp)
195                 pp.set_downloader(self)
196         
197         def to_stdout(self, message, skip_eol=False):
198                 """Print message to stdout if not in quiet mode."""
199                 if not self.params.get('quiet', False):
200                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
201                         sys.stdout.flush()
202         
203         def to_stderr(self, message):
204                 """Print message to stderr."""
205                 print >>sys.stderr, message
206         
207         def fixed_template(self):
208                 """Checks if the output template is fixed."""
209                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
210
211         def trouble(self, message=None):
212                 """Determine action to take when a download problem appears.
213
214                 Depending on if the downloader has been configured to ignore
215                 download errors or not, this method may throw an exception or
216                 not when errors are found, after printing the message.
217                 """
218                 if message is not None:
219                         self.to_stderr(message)
220                 if not self.params.get('ignoreerrors', False):
221                         raise DownloadError(message)
222                 self._download_retcode = 1
223
224         def slow_down(self, start_time, byte_counter):
225                 """Sleep if the download speed is over the rate limit."""
226                 rate_limit = self.params.get('ratelimit', None)
227                 if rate_limit is None or byte_counter == 0:
228                         return
229                 now = time.time()
230                 elapsed = now - start_time
231                 if elapsed <= 0.0:
232                         return
233                 speed = float(byte_counter) / elapsed
234                 if speed > rate_limit:
235                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
236
237         def report_destination(self, filename):
238                 """Report destination filename."""
239                 self.to_stdout(u'[download] Destination: %s' % filename)
240         
241         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
242                 """Report download progress."""
243                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
244                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
245         
246         def report_finish(self):
247                 """Report download finished."""
248                 self.to_stdout(u'')
249
250         def process_info(self, info_dict):
251                 """Process a single dictionary returned by an InfoExtractor."""
252                 # Forced printings
253                 if self.params.get('forcetitle', False):
254                         print info_dict['title'].encode(locale.getpreferredencoding())
255                 if self.params.get('forceurl', False):
256                         print info_dict['url'].encode(locale.getpreferredencoding())
257                         
258                 # Do nothing else if in simulate mode
259                 if self.params.get('simulate', False):
260                         return
261
262                 try:
263                         filename = self.params['outtmpl'] % info_dict
264                         self.report_destination(filename)
265                 except (ValueError, KeyError), err:
266                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
267                 if self.params['nooverwrites'] and os.path.exists(filename):
268                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
269                         return
270
271                 try:
272                         self.pmkdir(filename)
273                 except (OSError, IOError), err:
274                         self.trouble('ERROR: unable to create directories: %s' % str(err))
275                         return
276
277                 try:
278                         outstream = open(filename, 'wb')
279                 except (OSError, IOError), err:
280                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
281                         return
282
283                 try:
284                         self._do_download(outstream, info_dict['url'])
285                         outstream.close()
286                 except (OSError, IOError), err:
287                         if info_dict['best_quality']:
288                                 raise UnavailableFormatError
289                         else:
290                                 self.trouble('ERROR: unable to write video data: %s' % str(err))
291                                 return
292                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
293                         self.trouble('ERROR: unable to download video data: %s' % str(err))
294                         return
295
296                 try:
297                         self.post_process(filename, info_dict)
298                 except (PostProcessingError), err:
299                         self.trouble('ERROR: postprocessing: %s' % str(err))
300                         return
301
302         def download(self, url_list):
303                 """Download a given list of URLs."""
304                 if len(url_list) > 1 and self.fixed_template():
305                         raise SameFileError(self.params['outtmpl'])
306
307                 for url in url_list:
308                         suitable_found = False
309                         for ie in self._ies:
310                                 # Go to next InfoExtractor if not suitable
311                                 if not ie.suitable(url):
312                                         continue
313
314                                 # Suitable InfoExtractor found
315                                 suitable_found = True
316
317                                 # Extract information from URL and process it
318                                 ie.extract(url)
319
320                                 # Suitable InfoExtractor had been found; go to next URL
321                                 break
322
323                         if not suitable_found:
324                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
325
326                 return self._download_retcode
327
328         def post_process(self, filename, ie_info):
329                 """Run the postprocessing chain on the given file."""
330                 info = dict(ie_info)
331                 info['filepath'] = filename
332                 for pp in self._pps:
333                         info = pp.run(info)
334                         if info is None:
335                                 break
336         
337         def _do_download(self, stream, url):
338                 request = urllib2.Request(url, None, std_headers)
339                 data = urllib2.urlopen(request)
340                 data_len = data.info().get('Content-length', None)
341                 data_len_str = self.format_bytes(data_len)
342                 byte_counter = 0
343                 block_size = 1024
344                 start = time.time()
345                 while True:
346                         # Progress message
347                         percent_str = self.calc_percent(byte_counter, data_len)
348                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
349                         speed_str = self.calc_speed(start, time.time(), byte_counter)
350                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
351
352                         # Download and write
353                         before = time.time()
354                         data_block = data.read(block_size)
355                         after = time.time()
356                         data_block_len = len(data_block)
357                         if data_block_len == 0:
358                                 break
359                         byte_counter += data_block_len
360                         stream.write(data_block)
361                         block_size = self.best_block_size(after - before, data_block_len)
362
363                         # Apply rate limit
364                         self.slow_down(start, byte_counter)
365
366                 self.report_finish()
367                 if data_len is not None and str(byte_counter) != data_len:
368                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
369
class InfoExtractor(object):
        """Base class of the information extractors.

        An information extractor takes a URL and obtains the information of
        the video (or videos) it refers to: the real video URL, the literal
        and simplified titles, the uploader and so on. That information is
        put in a dictionary and handed to the FileDownloader, which processes
        it, possibly downloading the video to the file system. These are the
        fields the dictionaries must include:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        A subclass is expected to redefine _real_initialize() and
        _real_extract(), together with the suitable() static method, and
        probably to be instantiated and added to the main downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Build the extractor, optionally attaching a downloader to it."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Tell if the URL can be handled by this IE (never, in the base class)."""
                return False

        def initialize(self):
                """Run the real initialization (authentication, etc), but only once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize the IE if needed and return what _real_extract() gives for url."""
                self.initialize()
                outcome = self._real_extract(url)
                return outcome

        def set_downloader(self, downloader):
                """Attach the downloader this IE reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Hook for the actual initialization; subclasses redefine it."""
                return None

        def _real_extract(self, url):
                """Hook for the actual extraction; subclasses redefine it."""
                return None
430
431 class YoutubeIE(InfoExtractor):
432         """Information extractor for youtube.com."""
433
434         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
435         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
436         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
437         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
438         _NETRC_MACHINE = 'youtube'
439         _available_formats = ['22', '18', '17', '13'] # listed in order of priority for -b flag
440         _video_extensions = {
441                 '13': '3gp',
442                 '17': 'mp4',
443                 '18': 'mp4',
444                 '22': 'mp4',
445         }
446
447         @staticmethod
448         def suitable(url):
449                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
450
451         @staticmethod
452         def htmlentity_transform(matchobj):
453                 """Transforms an HTML entity to a Unicode character."""
454                 entity = matchobj.group(1)
455
456                 # Known non-numeric HTML entity
457                 if entity in htmlentitydefs.name2codepoint:
458                         return unichr(htmlentitydefs.name2codepoint[entity])
459
460                 # Unicode character
461                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
462                 if mobj is not None:
463                         numstr = mobj.group(1)
464                         if numstr.startswith(u'x'):
465                                 base = 16
466                                 numstr = u'0%s' % numstr
467                         else:
468                                 base = 10
469                         return unichr(long(numstr, base))
470
471                 # Unknown entity in name, return its literal representation
472                 return (u'&%s;' % entity)
473
474         def report_lang(self):
475                 """Report attempt to set language."""
476                 self._downloader.to_stdout(u'[youtube] Setting language')
477
478         def report_login(self):
479                 """Report attempt to log in."""
480                 self._downloader.to_stdout(u'[youtube] Logging in')
481         
482         def report_age_confirmation(self):
483                 """Report attempt to confirm age."""
484                 self._downloader.to_stdout(u'[youtube] Confirming age')
485         
486         def report_webpage_download(self, video_id):
487                 """Report attempt to download webpage."""
488                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
489         
490         def report_information_extraction(self, video_id):
491                 """Report attempt to extract video information."""
492                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
493         
494         def report_video_url(self, video_id, video_real_url):
495                 """Report extracted video URL."""
496                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
497         
498         def report_unavailable_format(self, video_id, format):
499                 """Report extracted video URL."""
500                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
501         
502         def _real_initialize(self):
503                 if self._downloader is None:
504                         return
505
506                 username = None
507                 password = None
508                 downloader_params = self._downloader.params
509
510                 # Attempt to use provided username and password or .netrc data
511                 if downloader_params.get('username', None) is not None:
512                         username = downloader_params['username']
513                         password = downloader_params['password']
514                 elif downloader_params.get('usenetrc', False):
515                         try:
516                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
517                                 if info is not None:
518                                         username = info[0]
519                                         password = info[2]
520                                 else:
521                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
522                         except (IOError, netrc.NetrcParseError), err:
523                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
524                                 return
525
526                 # Set language
527                 request = urllib2.Request(self._LANG_URL, None, std_headers)
528                 try:
529                         self.report_lang()
530                         urllib2.urlopen(request).read()
531                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
532                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
533                         return
534
535                 # No authentication to be performed
536                 if username is None:
537                         return
538
539                 # Log in
540                 login_form = {
541                                 'current_form': 'loginForm',
542                                 'next':         '/',
543                                 'action_login': 'Log In',
544                                 'username':     username,
545                                 'password':     password,
546                                 }
547                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
548                 try:
549                         self.report_login()
550                         login_results = urllib2.urlopen(request).read()
551                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
552                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
553                                 return
554                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
555                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
556                         return
557         
558                 # Confirm age
559                 age_form = {
560                                 'next_url':             '/',
561                                 'action_confirm':       'Confirm',
562                                 }
563                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
564                 try:
565                         self.report_age_confirmation()
566                         age_results = urllib2.urlopen(request).read()
567                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
568                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
569                         return
570
571         def _real_extract(self, url):
572                 # Extract video id from URL
573                 mobj = re.match(self._VALID_URL, url)
574                 if mobj is None:
575                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
576                         return
577                 video_id = mobj.group(2)
578
579                 # Downloader parameters
580                 best_quality = False
581                 format_param = None
582                 quality_index = 0
583                 if self._downloader is not None:
584                         params = self._downloader.params
585                         format_param = params.get('format', None)
586                         if format_param == '0':
587                                 format_param = self._available_formats[quality_index]
588                                 best_quality = True
589
590                 while True:
591                         try:
592                                 # Extension
593                                 video_extension = self._video_extensions.get(format_param, 'flv')
594
595                                 # Normalize URL, including format
596                                 normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
597                                 if format_param is not None:
598                                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
599                                 request = urllib2.Request(normalized_url, None, std_headers)
600                                 try:
601                                         self.report_webpage_download(video_id)
602                                         video_webpage = urllib2.urlopen(request).read()
603                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
604                                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
605                                         return
606                                 self.report_information_extraction(video_id)
607                                 
608                                 # "t" param
609                                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
610                                 if mobj is None:
611                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
612                                         return
613                                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
614                                 if format_param is not None:
615                                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
616                                 self.report_video_url(video_id, video_real_url)
617
618                                 # uploader
619                                 mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
620                                 if mobj is None:
621                                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
622                                         return
623                                 video_uploader = mobj.group(1)
624
625                                 # title
626                                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
627                                 if mobj is None:
628                                         self._downloader.trouble(u'ERROR: unable to extract video title')
629                                         return
630                                 video_title = mobj.group(1).decode('utf-8')
631                                 video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
632                                 video_title = video_title.replace(os.sep, u'%')
633
634                                 # simplified title
635                                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
636                                 simple_title = simple_title.strip(ur'_')
637
638                                 # Process video information
639                                 self._downloader.process_info({
640                                         'id':           video_id.decode('utf-8'),
641                                         'url':          video_real_url.decode('utf-8'),
642                                         'uploader':     video_uploader.decode('utf-8'),
643                                         'title':        video_title,
644                                         'stitle':       simple_title,
645                                         'ext':          video_extension.decode('utf-8'),
646                                         'best_quality': best_quality,
647                                 })
648
649                                 return
650
651                         except UnavailableFormatError:
652                                 if quality_index == len(self._available_formats) - 1:
653                                         # I don't ever expect this to happen
654                                         self._downloader.trouble(u'ERROR: no known formats available for video')
655                                         return
656                                 else:
657                                         self.report_unavailable_format(video_id, format_param)
658                                         quality_index += 1
659                                         format_param = self._available_formats[quality_index]
660                                         continue
661
662 class MetacafeIE(InfoExtractor):
663         """Information Extractor for metacafe.com."""
664
665         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
666         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
667         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
668         _youtube_ie = None
669
670         def __init__(self, youtube_ie, downloader=None):
671                 InfoExtractor.__init__(self, downloader)
672                 self._youtube_ie = youtube_ie
673
674         @staticmethod
675         def suitable(url):
676                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
677
678         def report_disclaimer(self):
679                 """Report disclaimer retrieval."""
680                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
681
682         def report_age_confirmation(self):
683                 """Report attempt to confirm age."""
684                 self._downloader.to_stdout(u'[metacafe] Confirming age')
685         
686         def report_download_webpage(self, video_id):
687                 """Report webpage download."""
688                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
689         
690         def report_extraction(self, video_id):
691                 """Report information extraction."""
692                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
693
694         def _real_initialize(self):
695                 # Retrieve disclaimer
696                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
697                 try:
698                         self.report_disclaimer()
699                         disclaimer = urllib2.urlopen(request).read()
700                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
701                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
702                         return
703
704                 # Confirm age
705                 disclaimer_form = {
706                         'filters': '0',
707                         'submit': "Continue - I'm over 18",
708                         }
709                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
710                 try:
711                         self.report_age_confirmation()
712                         disclaimer = urllib2.urlopen(request).read()
713                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
714                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
715                         return
716         
717         def _real_extract(self, url):
718                 # Extract id and simplified title from URL
719                 mobj = re.match(self._VALID_URL, url)
720                 if mobj is None:
721                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
722                         return
723
724                 video_id = mobj.group(1)
725
726                 # Check if video comes from YouTube
727                 mobj2 = re.match(r'^yt-(.*)$', video_id)
728                 if mobj2 is not None:
729                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
730                         return
731
732                 simple_title = mobj.group(2).decode('utf-8')
733                 video_extension = 'flv'
734
735                 # Retrieve video webpage to extract further information
736                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
737                 try:
738                         self.report_download_webpage(video_id)
739                         webpage = urllib2.urlopen(request).read()
740                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
741                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
742                         return
743
744                 # Extract URL, uploader and title from webpage
745                 self.report_extraction(video_id)
746                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
747                 if mobj is None:
748                         self._downloader.trouble(u'ERROR: unable to extract media URL')
749                         return
750                 mediaURL = urllib.unquote(mobj.group(1))
751
752                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
753                 if mobj is None:
754                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
755                         return
756                 gdaKey = mobj.group(1)
757
758                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
759
760                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
761                 if mobj is None:
762                         self._downloader.trouble(u'ERROR: unable to extract title')
763                         return
764                 video_title = mobj.group(1).decode('utf-8')
765
766                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
767                 if mobj is None:
768                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
769                         return
770                 video_uploader = mobj.group(1)
771
772                 # Process video information
773                 self._downloader.process_info({
774                         'id':           video_id.decode('utf-8'),
775                         'url':          video_url.decode('utf-8'),
776                         'uploader':     video_uploader.decode('utf-8'),
777                         'title':        video_title,
778                         'stitle':       simple_title,
779                         'ext':          video_extension.decode('utf-8'),
780                         'best_quality': False, # TODO
781                 })
782
783
784 class YoutubeSearchIE(InfoExtractor):
785         """Information Extractor for YouTube search queries."""
786         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
787         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
788         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
789         _MORE_PAGES_INDICATOR = r'>Next</a>'
790         _youtube_ie = None
791         _max_youtube_results = 1000
792
793         def __init__(self, youtube_ie, downloader=None):
794                 InfoExtractor.__init__(self, downloader)
795                 self._youtube_ie = youtube_ie
796         
797         @staticmethod
798         def suitable(url):
799                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
800
801         def report_download_page(self, query, pagenum):
802                 """Report attempt to download playlist page with given number."""
803                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
804
805         def _real_initialize(self):
806                 self._youtube_ie.initialize()
807         
808         def _real_extract(self, query):
809                 mobj = re.match(self._VALID_QUERY, query)
810                 if mobj is None:
811                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
812                         return
813
814                 prefix, query = query.split(':')
815                 prefix = prefix[8:]
816                 if prefix == '':
817                         self._download_n_results(query, 1)
818                         return
819                 elif prefix == 'all':
820                         self._download_n_results(query, self._max_youtube_results)
821                         return
822                 else:
823                         try:
824                                 n = int(prefix)
825                                 if n <= 0:
826                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
827                                         return
828                                 elif n > self._max_youtube_results:
829                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
830                                         n = self._max_youtube_results
831                                 self._download_n_results(query, n)
832                                 return
833                         except ValueError: # parsing prefix as int fails
834                                 self._download_n_results(query, 1)
835                                 return
836
837         def _download_n_results(self, query, n):
838                 """Downloads a specified number of results for a query"""
839
840                 video_ids = []
841                 already_seen = set()
842                 pagenum = 1
843
844                 while True:
845                         self.report_download_page(query, pagenum)
846                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
847                         request = urllib2.Request(result_url, None, std_headers)
848                         try:
849                                 page = urllib2.urlopen(request).read()
850                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
851                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
852                                 return
853
854                         # Extract video identifiers
855                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
856                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
857                                 if video_id not in already_seen:
858                                         video_ids.append(video_id)
859                                         already_seen.add(video_id)
860                                         if len(video_ids) == n:
861                                                 # Specified n videos reached
862                                                 for id in video_ids:
863                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
864                                                 return
865
866                         if self._MORE_PAGES_INDICATOR not in page:
867                                 for id in video_ids:
868                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
869                                 return
870
871                         pagenum = pagenum + 1
872
873 class YoutubePlaylistIE(InfoExtractor):
874         """Information Extractor for YouTube playlists."""
875
876         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
877         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
878         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
879         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
880         _youtube_ie = None
881
882         def __init__(self, youtube_ie, downloader=None):
883                 InfoExtractor.__init__(self, downloader)
884                 self._youtube_ie = youtube_ie
885         
886         @staticmethod
887         def suitable(url):
888                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
889
890         def report_download_page(self, playlist_id, pagenum):
891                 """Report attempt to download playlist page with given number."""
892                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
893
894         def _real_initialize(self):
895                 self._youtube_ie.initialize()
896         
897         def _real_extract(self, url):
898                 # Extract playlist id
899                 mobj = re.match(self._VALID_URL, url)
900                 if mobj is None:
901                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
902                         return
903
904                 # Download playlist pages
905                 playlist_id = mobj.group(1)
906                 video_ids = []
907                 pagenum = 1
908
909                 while True:
910                         self.report_download_page(playlist_id, pagenum)
911                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
912                         try:
913                                 page = urllib2.urlopen(request).read()
914                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
915                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
916                                 return
917
918                         # Extract video identifiers
919                         ids_in_page = []
920                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
921                                 if mobj.group(1) not in ids_in_page:
922                                         ids_in_page.append(mobj.group(1))
923                         video_ids.extend(ids_in_page)
924
925                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
926                                 break
927                         pagenum = pagenum + 1
928
929                 for id in video_ids:
930                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
931                 return
932
class PostProcessor(object):
        """Base class for post-download processing steps.

        A downloader accepts PostProcessor objects through its
        add_post_processor() method. After a successful download it walks
        its chain of PostProcessors calling run() on each of them: the first
        one receives an initial argument and every following one receives
        the value returned by the previous PostProcessor.

        The walk ends when a run() call returns None or when there are no
        more PostProcessors in the chain.

        Like InfoExtractor objects, PostProcessor objects take part in a
        "mutual registration" process with the downloader.
        """

        # Downloader this PostProcessor is attached to, if any
        _downloader = None

        def __init__(self, downloader=None):
                """Create the PostProcessor, optionally attached to a downloader."""
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Sets the downloader for this PP."""
                self._downloader = downloader

        def run(self, information):
                """Run the PostProcessor.

                "information" is a dictionary like the ones built by
                InfoExtractors, with an additional "filepath" field that
                points to the downloaded file.

                Returning None stops the postprocessing chain. Returning an
                information dictionary (possibly the received one with some
                fields changed) passes it on to the next PostProcessor in
                the chain.

                This method may also raise a PostProcessingError exception,
                which will be taken into account by the downloader it was
                called from.
                """
                # The base implementation passes the information through untouched
                return information
978         
### MAIN PROGRAM ###
# Parses the command line, validates the options, builds the information
# extractors and the file downloader and downloads every requested URL.
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                # install_opener() replaces the previously installed opener, so proxy
                # and cookie support have to be combined in a single opener
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='INTERNAL',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                verbosity.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                verbosity.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Release the file handle even if reading fails
                                        batchfd.close()
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                        # Strip surrounding whitespace and drop empty lines
                        batchurls = [x.strip() for x in batchurls]
                        batchurls = [x for x in batchurls if len(x) > 0]
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        sys.exit(u'ERROR: you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        sys.exit(u'ERROR: account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        sys.exit(u'ERROR: using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                sys.exit(u'ERROR: invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Output template: an explicit (non-empty) template wins, then the
                # simplified title, then the literal title, then the bare video id
                if opts.outtmpl:
                        outtmpl = opts.outtmpl.decode(locale.getpreferredencoding())
                elif opts.usetitle:
                        outtmpl = u'%(stitle)s-%(id)s.%(ext)s'
                elif opts.useliteral:
                        outtmpl = u'%(title)s-%(id)s.%(ext)s'
                else:
                        outtmpl = u'%(id)s.%(ext)s'

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        'outtmpl': outtmpl,
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        })
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')