b1f2717936c93cd0503bcf78deb0f6ccf54e47be
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
def preferredencoding():
        """Get preferred encoding.

        Returns the encoding reported by locale.getpreferredencoding() when
        it is non-empty and Python can actually encode text with it.
        Otherwise 'UTF-8' is returned; if obtaining or validating the
        encoding raised an error, a warning is written to stderr first.
        """
        try:
                pref = locale.getpreferredencoding()
                # Mac OSX systems have this problem sometimes
                if not pref:
                        return 'UTF-8'
                # Make sure the reported name is a codec Python knows about;
                # otherwise every later .encode(pref) call would raise.
                u'TEST'.encode(pref)
                return pref
        except Exception:
                # Only real errors are handled here, so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                sys.stderr.write('WARNING: problem obtaining preferred encoding. Falling back to UTF-8.\n')
                return 'UTF-8'
45
class DownloadError(Exception):
        """Signals that a download problem was found.

        FileDownloader objects raise it, carrying the error message, when
        they have not been told to carry on after errors.
        """
54
class SameFileError(Exception):
        """Signals that several downloads would end up in one file.

        FileDownloader objects raise it when they detect that more than one
        download would be written to the same file on disk.
        """
62
class PostProcessingError(Exception):
        """Signals a failure inside a postprocessing task.

        The .run() method of a PostProcessor may raise it to report that
        the postprocessing step could not be completed.
        """
70
class UnavailableFormatError(Exception):
        """Signals that the requested video format cannot be served.

        It is raised when a video is asked for in a format that does not
        exist for that particular video.
        """
78
class ContentTooShortError(Exception):
        """Content Too Short exception.

        This exception may be raised by FileDownloader objects when a file they
        download is too small for what the server announced first, indicating
        the connection was probably interrupted.

        Attributes:
        downloaded:     Number of bytes actually received.
        expected:       Number of bytes announced by the server.
        """
        # Both in bytes
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                """Store both byte counts and initialise the base exception."""
                # Passing the values on keeps .args consistent with the
                # constructor signature instead of leaving it empty.
                Exception.__init__(self, downloaded, expected)
                self.downloaded = downloaded
                self.expected = expected

        def __str__(self):
                """Return a readable description instead of an empty string."""
                return 'content too short (expected %s bytes and served %s)' % (self.expected, self.downloaded)
93
94 class FileDownloader(object):
95         """File Downloader class.
96
97         File downloader objects are the ones responsible of downloading the
98         actual video file and writing it to disk if the user has requested
99         it, among some other tasks. In most cases there should be one per
100         program. As, given a video URL, the downloader doesn't know how to
101         extract all the needed information, task that InfoExtractors do, it
102         has to pass the URL to one of them.
103
104         For this, file downloader objects have a method that allows
105         InfoExtractors to be registered in a given order. When it is passed
106         a URL, the file downloader handles it to the first InfoExtractor it
107         finds that reports being able to handle it. The InfoExtractor extracts
108         all the information about the video or videos the URL refers to, and
109         asks the FileDownloader to process the video information, possibly
110         downloading the video.
111
112         File downloaders accept a lot of parameters. In order not to saturate
113         the object constructor with arguments, it receives a dictionary of
114         options instead. These options are available through the params
115         attribute for the InfoExtractors to use. The FileDownloader also
116         registers itself as the downloader in charge for the InfoExtractors
117         that are added to it, so this is a "mutual registration".
118
119         Available options:
120
121         username:       Username for authentication purposes.
122         password:       Password for authentication purposes.
123         usenetrc:       Use netrc for authentication instead.
124         quiet:          Do not print messages to stdout.
125         forceurl:       Force printing final URL.
126         forcetitle:     Force printing title.
127         simulate:       Do not download the video files.
128         format:         Video format code.
129         outtmpl:        Template for output names.
130         ignoreerrors:   Do not stop on download errors.
131         ratelimit:      Download speed limit, in bytes/sec.
132         nooverwrites:   Prevent overwriting files.
133         continuedl:     Try to continue downloads if possible.
134         """
135
136         params = None
137         _ies = []
138         _pps = []
139         _download_retcode = None
140
141         def __init__(self, params):
142                 """Create a FileDownloader object with the given options."""
143                 self._ies = []
144                 self._pps = []
145                 self._download_retcode = 0
146                 self.params = params
147         
148         @staticmethod
149         def pmkdir(filename):
150                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
151                 components = filename.split(os.sep)
152                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
153                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
154                 for dir in aggregate:
155                         if not os.path.exists(dir):
156                                 os.mkdir(dir)
157         
158         @staticmethod
159         def format_bytes(bytes):
160                 if bytes is None:
161                         return 'N/A'
162                 if type(bytes) is str:
163                         bytes = float(bytes)
164                 if bytes == 0.0:
165                         exponent = 0
166                 else:
167                         exponent = long(math.log(bytes, 1024.0))
168                 suffix = 'bkMGTPEZY'[exponent]
169                 converted = float(bytes) / float(1024**exponent)
170                 return '%.2f%s' % (converted, suffix)
171
172         @staticmethod
173         def calc_percent(byte_counter, data_len):
174                 if data_len is None:
175                         return '---.-%'
176                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
177
178         @staticmethod
179         def calc_eta(start, now, total, current):
180                 if total is None:
181                         return '--:--'
182                 dif = now - start
183                 if current == 0 or dif < 0.001: # One millisecond
184                         return '--:--'
185                 rate = float(current) / dif
186                 eta = long((float(total) - float(current)) / rate)
187                 (eta_mins, eta_secs) = divmod(eta, 60)
188                 if eta_mins > 99:
189                         return '--:--'
190                 return '%02d:%02d' % (eta_mins, eta_secs)
191
192         @staticmethod
193         def calc_speed(start, now, bytes):
194                 dif = now - start
195                 if bytes == 0 or dif < 0.001: # One millisecond
196                         return '%10s' % '---b/s'
197                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
198
199         @staticmethod
200         def best_block_size(elapsed_time, bytes):
201                 new_min = max(bytes / 2.0, 1.0)
202                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
203                 if elapsed_time < 0.001:
204                         return long(new_max)
205                 rate = bytes / elapsed_time
206                 if rate > new_max:
207                         return long(new_max)
208                 if rate < new_min:
209                         return long(new_min)
210                 return long(rate)
211
212         @staticmethod
213         def parse_bytes(bytestr):
214                 """Parse a string indicating a byte quantity into a long integer."""
215                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
216                 if matchobj is None:
217                         return None
218                 number = float(matchobj.group(1))
219                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
220                 return long(round(number * multiplier))
221
222         @staticmethod
223         def verify_url(url):
224                 """Verify a URL is valid and data could be downloaded. Return real data URL."""
225                 request = urllib2.Request(url, None, std_headers)
226                 data = urllib2.urlopen(request)
227                 data.read(1)
228                 url = data.geturl()
229                 data.close()
230                 return url
231
232         def add_info_extractor(self, ie):
233                 """Add an InfoExtractor object to the end of the list."""
234                 self._ies.append(ie)
235                 ie.set_downloader(self)
236         
237         def add_post_processor(self, pp):
238                 """Add a PostProcessor object to the end of the chain."""
239                 self._pps.append(pp)
240                 pp.set_downloader(self)
241         
242         def to_stdout(self, message, skip_eol=False):
243                 """Print message to stdout if not in quiet mode."""
244                 if not self.params.get('quiet', False):
245                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
246                         sys.stdout.flush()
247         
248         def to_stderr(self, message):
249                 """Print message to stderr."""
250                 print >>sys.stderr, message.encode(preferredencoding())
251         
252         def fixed_template(self):
253                 """Checks if the output template is fixed."""
254                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
255
256         def trouble(self, message=None):
257                 """Determine action to take when a download problem appears.
258
259                 Depending on if the downloader has been configured to ignore
260                 download errors or not, this method may throw an exception or
261                 not when errors are found, after printing the message.
262                 """
263                 if message is not None:
264                         self.to_stderr(message)
265                 if not self.params.get('ignoreerrors', False):
266                         raise DownloadError(message)
267                 self._download_retcode = 1
268
269         def slow_down(self, start_time, byte_counter):
270                 """Sleep if the download speed is over the rate limit."""
271                 rate_limit = self.params.get('ratelimit', None)
272                 if rate_limit is None or byte_counter == 0:
273                         return
274                 now = time.time()
275                 elapsed = now - start_time
276                 if elapsed <= 0.0:
277                         return
278                 speed = float(byte_counter) / elapsed
279                 if speed > rate_limit:
280                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
281
282         def report_destination(self, filename):
283                 """Report destination filename."""
284                 self.to_stdout(u'[download] Destination: %s' % filename)
285         
286         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
287                 """Report download progress."""
288                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
289                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
290
291         def report_resuming_byte(self, resume_len):
292                 """Report attemtp to resume at given byte."""
293                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
294         
295         def report_file_already_downloaded(self, file_name):
296                 """Report file has already been fully downloaded."""
297                 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
298         
299         def report_unable_to_resume(self):
300                 """Report it was impossible to resume download."""
301                 self.to_stdout(u'[download] Unable to resume')
302         
303         def report_finish(self):
304                 """Report download finished."""
305                 self.to_stdout(u'')
306
307         def process_info(self, info_dict):
308                 """Process a single dictionary returned by an InfoExtractor."""
309                 # Do nothing else if in simulate mode
310                 if self.params.get('simulate', False):
311                         try:
312                                 info_dict['url'] = self.verify_url(info_dict['url'])
313                         except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
314                                 raise UnavailableFormatError
315
316                         # Forced printings
317                         if self.params.get('forcetitle', False):
318                                 print info_dict['title'].encode(preferredencoding())
319                         if self.params.get('forceurl', False):
320                                 print info_dict['url'].encode(preferredencoding())
321
322                         return
323                         
324                 try:
325                         template_dict = dict(info_dict)
326                         template_dict['epoch'] = unicode(long(time.time()))
327                         filename = self.params['outtmpl'] % template_dict
328                 except (ValueError, KeyError), err:
329                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
330                 if self.params['nooverwrites'] and os.path.exists(filename):
331                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
332                         return
333
334                 try:
335                         self.pmkdir(filename)
336                 except (OSError, IOError), err:
337                         self.trouble('ERROR: unable to create directories: %s' % str(err))
338                         return
339
340                 try:
341                         success = self._do_download(filename, info_dict['url'])
342                 except (OSError, IOError), err:
343                         raise UnavailableFormatError
344                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
345                         self.trouble('ERROR: unable to download video data: %s' % str(err))
346                         return
347                 except (ContentTooShortError, ), err:
348                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
349                         return
350
351                 if success:
352                         try:
353                                 self.post_process(filename, info_dict)
354                         except (PostProcessingError), err:
355                                 self.trouble('ERROR: postprocessing: %s' % str(err))
356                                 return
357
358         def download(self, url_list):
359                 """Download a given list of URLs."""
360                 if len(url_list) > 1 and self.fixed_template():
361                         raise SameFileError(self.params['outtmpl'])
362
363                 for url in url_list:
364                         suitable_found = False
365                         for ie in self._ies:
366                                 # Go to next InfoExtractor if not suitable
367                                 if not ie.suitable(url):
368                                         continue
369
370                                 # Suitable InfoExtractor found
371                                 suitable_found = True
372
373                                 # Extract information from URL and process it
374                                 ie.extract(url)
375
376                                 # Suitable InfoExtractor had been found; go to next URL
377                                 break
378
379                         if not suitable_found:
380                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
381
382                 return self._download_retcode
383
384         def post_process(self, filename, ie_info):
385                 """Run the postprocessing chain on the given file."""
386                 info = dict(ie_info)
387                 info['filepath'] = filename
388                 for pp in self._pps:
389                         info = pp.run(info)
390                         if info is None:
391                                 break
392         
393         def _do_download(self, filename, url):
394                 stream = None
395                 open_mode = 'ab'
396
397                 basic_request = urllib2.Request(url, None, std_headers)
398                 request = urllib2.Request(url, None, std_headers)
399
400                 # Attempt to resume download with "continuedl" option
401                 if os.path.isfile(filename):
402                         resume_len = os.path.getsize(filename)
403                 else:
404                         resume_len = 0
405                 if self.params['continuedl'] and resume_len != 0:
406                         self.report_resuming_byte(resume_len)
407                         request.add_header('Range','bytes=%d-' % resume_len)
408
409                 # Establish connection
410                 try:
411                         data = urllib2.urlopen(request)
412                 except (urllib2.HTTPError, ), err:
413                         if err.code != 416: #  416 is 'Requested range not satisfiable'
414                                 raise
415                         data = urllib2.urlopen(basic_request)
416                         content_length = data.info()['Content-Length']
417                         if content_length is not None and long(content_length) == resume_len:
418                                 self.report_file_already_downloaded(filename)
419                                 return True
420                         else:
421                                 self.report_unable_to_resume()
422                                 open_mode = 'wb'
423
424                 data_len = data.info().get('Content-length', None)
425                 data_len_str = self.format_bytes(data_len)
426                 byte_counter = 0
427                 block_size = 1024
428                 start = time.time()
429                 while True:
430                         # Download and write
431                         before = time.time()
432                         data_block = data.read(block_size)
433                         after = time.time()
434                         data_block_len = len(data_block)
435                         if data_block_len == 0:
436                                 break
437                         byte_counter += data_block_len
438
439                         # Open file just in time
440                         if stream is None:
441                                 try:
442                                         stream = open(filename, open_mode)
443                                         self.report_destination(filename)
444                                 except (OSError, IOError), err:
445                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
446                                         return False
447                         stream.write(data_block)
448                         block_size = self.best_block_size(after - before, data_block_len)
449
450                         # Progress message
451                         percent_str = self.calc_percent(byte_counter, data_len)
452                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
453                         speed_str = self.calc_speed(start, time.time(), byte_counter)
454                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
455
456                         # Apply rate limit
457                         self.slow_down(start, byte_counter)
458
459                 self.report_finish()
460                 if data_len is not None and str(byte_counter) != data_len:
461                         raise ContentTooShortError(byte_counter, long(data_len))
462                 return True
463
class InfoExtractor(object):
        """Base class for information extractors.

        An information extractor receives a URL and obtains the data of the
        video (or videos) behind it: the real video URL, the title and
        simplified title, the author and so on. That data is stored in a
        dictionary which is handed over to the FileDownloader, which then
        processes it, possibly downloading the video to the file system,
        among other possible outcomes. Every dictionary must carry these
        fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses are expected to override _real_initialize() and
        _real_extract(), together with the suitable() static method. They
        will normally also be instantiated and registered with the main
        downloader.
        """

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Build the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Tell whether this IE can handle the URL; the base class never can."""
                return False

        def initialize(self):
                """Run the real initialization (authentication, etc) only once."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed and return what the real extraction returns."""
                self.initialize()
                result = self._real_extract(url)
                return result

        def set_downloader(self, downloader):
                """Remember the downloader this IE works for."""
                self._downloader = downloader

        def _real_initialize(self):
                """Initialization hook. Does nothing here; redefine in subclasses."""
                return None

        def _real_extract(self, url):
                """Extraction hook. Does nothing here; redefine in subclasses."""
                return None
524
525 class YoutubeIE(InfoExtractor):
526         """Information extractor for youtube.com."""
527
        # Pattern accepted by suitable(). Group 1 is the optional youtube.com
        # URL prefix; group 2 is the video identifier read by _real_extract().
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Page requested by _real_initialize() to set the language.
        _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        # Target of the login form posted by _real_initialize().
        _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
        # Target of the age confirmation form posted by _real_initialize().
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Machine name looked up in .netrc when the 'usenetrc' option is set.
        _NETRC_MACHINE = 'youtube'
        # Format codes; _real_extract() starts from the first entry when the
        # requested format is '0'.
        # NOTE(review): the trailing None presumably means "no explicit format" - confirm.
        _available_formats = ['22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
        # File extension per format code; _real_extract() falls back to 'flv'
        # for codes not listed here.
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
        }
540
541         @staticmethod
542         def suitable(url):
543                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
544
545         @staticmethod
546         def htmlentity_transform(matchobj):
547                 """Transforms an HTML entity to a Unicode character."""
548                 entity = matchobj.group(1)
549
550                 # Known non-numeric HTML entity
551                 if entity in htmlentitydefs.name2codepoint:
552                         return unichr(htmlentitydefs.name2codepoint[entity])
553
554                 # Unicode character
555                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
556                 if mobj is not None:
557                         numstr = mobj.group(1)
558                         if numstr.startswith(u'x'):
559                                 base = 16
560                                 numstr = u'0%s' % numstr
561                         else:
562                                 base = 10
563                         return unichr(long(numstr, base))
564
565                 # Unknown entity in name, return its literal representation
566                 return (u'&%s;' % entity)
567
568         def report_lang(self):
569                 """Report attempt to set language."""
570                 self._downloader.to_stdout(u'[youtube] Setting language')
571
572         def report_login(self):
573                 """Report attempt to log in."""
574                 self._downloader.to_stdout(u'[youtube] Logging in')
575         
576         def report_age_confirmation(self):
577                 """Report attempt to confirm age."""
578                 self._downloader.to_stdout(u'[youtube] Confirming age')
579         
580         def report_video_info_webpage_download(self, video_id):
581                 """Report attempt to download video info webpage."""
582                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
583         
584         def report_information_extraction(self, video_id):
585                 """Report attempt to extract video information."""
586                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
587         
588         def report_unavailable_format(self, video_id, format):
589                 """Report extracted video URL."""
590                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
591         
592         def _real_initialize(self):
593                 if self._downloader is None:
594                         return
595
596                 username = None
597                 password = None
598                 downloader_params = self._downloader.params
599
600                 # Attempt to use provided username and password or .netrc data
601                 if downloader_params.get('username', None) is not None:
602                         username = downloader_params['username']
603                         password = downloader_params['password']
604                 elif downloader_params.get('usenetrc', False):
605                         try:
606                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
607                                 if info is not None:
608                                         username = info[0]
609                                         password = info[2]
610                                 else:
611                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
612                         except (IOError, netrc.NetrcParseError), err:
613                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
614                                 return
615
616                 # Set language
617                 request = urllib2.Request(self._LANG_URL, None, std_headers)
618                 try:
619                         self.report_lang()
620                         urllib2.urlopen(request).read()
621                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
622                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
623                         return
624
625                 # No authentication to be performed
626                 if username is None:
627                         return
628
629                 # Log in
630                 login_form = {
631                                 'current_form': 'loginForm',
632                                 'next':         '/',
633                                 'action_login': 'Log In',
634                                 'username':     username,
635                                 'password':     password,
636                                 }
637                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
638                 try:
639                         self.report_login()
640                         login_results = urllib2.urlopen(request).read()
641                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
642                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
643                                 return
644                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
645                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
646                         return
647         
648                 # Confirm age
649                 age_form = {
650                                 'next_url':             '/',
651                                 'action_confirm':       'Confirm',
652                                 }
653                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
654                 try:
655                         self.report_age_confirmation()
656                         age_results = urllib2.urlopen(request).read()
657                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
658                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
659                         return
660
661         def _real_extract(self, url):
662                 # Extract video id from URL
663                 mobj = re.match(self._VALID_URL, url)
664                 if mobj is None:
665                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
666                         return
667                 video_id = mobj.group(2)
668
669                 # Downloader parameters
670                 best_quality = False
671                 format_param = None
672                 quality_index = 0
673                 if self._downloader is not None:
674                         params = self._downloader.params
675                         format_param = params.get('format', None)
676                         if format_param == '0':
677                                 format_param = self._available_formats[quality_index]
678                                 best_quality = True
679
680                 while True:
681                         # Extension
682                         video_extension = self._video_extensions.get(format_param, 'flv')
683
684                         # Get video info
685                         video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
686                         request = urllib2.Request(video_info_url, None, std_headers)
687                         try:
688                                 self.report_video_info_webpage_download(video_id)
689                                 video_info_webpage = urllib2.urlopen(request).read()
690                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
691                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
692                                 return
693                         self.report_information_extraction(video_id)
694
695                         # "t" param
696                         mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
697                         if mobj is None:
698                                 # Attempt to see if YouTube has issued an error message
699                                 mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
700                                 if mobj is None:
701                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
702                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
703                                         stream.write(video_info_webpage)
704                                         stream.close()
705                                 else:
706                                         reason = urllib.unquote_plus(mobj.group(1))
707                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
708                                 return
709                         token = urllib.unquote(mobj.group(1))
710                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
711                         if format_param is not None:
712                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
713
714                         # uploader
715                         mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
716                         if mobj is None:
717                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
718                                 return
719                         video_uploader = urllib.unquote(mobj.group(1))
720
721                         # title
722                         mobj = re.search(r'(?m)&title=([^&]+)(?:&|$)', video_info_webpage)
723                         if mobj is None:
724                                 self._downloader.trouble(u'ERROR: unable to extract video title')
725                                 return
726                         video_title = urllib.unquote(mobj.group(1))
727                         video_title = video_title.decode('utf-8')
728                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
729                         video_title = video_title.replace(os.sep, u'%')
730
731                         # simplified title
732                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
733                         simple_title = simple_title.strip(ur'_')
734
735                         try:
736                                 # Process video information
737                                 self._downloader.process_info({
738                                         'id':           video_id.decode('utf-8'),
739                                         'url':          video_real_url.decode('utf-8'),
740                                         'uploader':     video_uploader.decode('utf-8'),
741                                         'title':        video_title,
742                                         'stitle':       simple_title,
743                                         'ext':          video_extension.decode('utf-8'),
744                                 })
745
746                                 return
747
748                         except UnavailableFormatError, err:
749                                 if best_quality:
750                                         if quality_index == len(self._available_formats) - 1:
751                                                 # I don't ever expect this to happen
752                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
753                                                 return
754                                         else:
755                                                 self.report_unavailable_format(video_id, format_param)
756                                                 quality_index += 1
757                                                 format_param = self._available_formats[quality_index]
758                                                 continue
759                                 else: 
760                                         self._downloader.trouble('ERROR: format not available for video')
761                                         return
762
763
764 class MetacafeIE(InfoExtractor):
765         """Information Extractor for metacafe.com."""
766
767         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
768         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
769         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
770         _youtube_ie = None
771
772         def __init__(self, youtube_ie, downloader=None):
773                 InfoExtractor.__init__(self, downloader)
774                 self._youtube_ie = youtube_ie
775
776         @staticmethod
777         def suitable(url):
778                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
779
780         def report_disclaimer(self):
781                 """Report disclaimer retrieval."""
782                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
783
784         def report_age_confirmation(self):
785                 """Report attempt to confirm age."""
786                 self._downloader.to_stdout(u'[metacafe] Confirming age')
787         
788         def report_download_webpage(self, video_id):
789                 """Report webpage download."""
790                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
791         
792         def report_extraction(self, video_id):
793                 """Report information extraction."""
794                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
795
796         def _real_initialize(self):
797                 # Retrieve disclaimer
798                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
799                 try:
800                         self.report_disclaimer()
801                         disclaimer = urllib2.urlopen(request).read()
802                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
803                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
804                         return
805
806                 # Confirm age
807                 disclaimer_form = {
808                         'filters': '0',
809                         'submit': "Continue - I'm over 18",
810                         }
811                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
812                 try:
813                         self.report_age_confirmation()
814                         disclaimer = urllib2.urlopen(request).read()
815                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
816                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
817                         return
818         
819         def _real_extract(self, url):
820                 # Extract id and simplified title from URL
821                 mobj = re.match(self._VALID_URL, url)
822                 if mobj is None:
823                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
824                         return
825
826                 video_id = mobj.group(1)
827
828                 # Check if video comes from YouTube
829                 mobj2 = re.match(r'^yt-(.*)$', video_id)
830                 if mobj2 is not None:
831                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
832                         return
833
834                 simple_title = mobj.group(2).decode('utf-8')
835                 video_extension = 'flv'
836
837                 # Retrieve video webpage to extract further information
838                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
839                 try:
840                         self.report_download_webpage(video_id)
841                         webpage = urllib2.urlopen(request).read()
842                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
843                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
844                         return
845
846                 # Extract URL, uploader and title from webpage
847                 self.report_extraction(video_id)
848                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
849                 if mobj is None:
850                         self._downloader.trouble(u'ERROR: unable to extract media URL')
851                         return
852                 mediaURL = urllib.unquote(mobj.group(1))
853
854                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
855                 #if mobj is None:
856                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
857                 #       return
858                 #gdaKey = mobj.group(1)
859                 #
860                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
861
862                 video_url = mediaURL
863
864                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
865                 if mobj is None:
866                         self._downloader.trouble(u'ERROR: unable to extract title')
867                         return
868                 video_title = mobj.group(1).decode('utf-8')
869
870                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
871                 if mobj is None:
872                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
873                         return
874                 video_uploader = mobj.group(1)
875
876                 try:
877                         # Process video information
878                         self._downloader.process_info({
879                                 'id':           video_id.decode('utf-8'),
880                                 'url':          video_url.decode('utf-8'),
881                                 'uploader':     video_uploader.decode('utf-8'),
882                                 'title':        video_title,
883                                 'stitle':       simple_title,
884                                 'ext':          video_extension.decode('utf-8'),
885                         })
886                 except UnavailableFormatError:
887                         self._downloader.trouble(u'ERROR: format not available for video')
888
889
890 class YoutubeSearchIE(InfoExtractor):
891         """Information Extractor for YouTube search queries."""
892         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
893         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
894         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
895         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
896         _youtube_ie = None
897         _max_youtube_results = 1000
898
899         def __init__(self, youtube_ie, downloader=None):
900                 InfoExtractor.__init__(self, downloader)
901                 self._youtube_ie = youtube_ie
902         
903         @staticmethod
904         def suitable(url):
905                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
906
907         def report_download_page(self, query, pagenum):
908                 """Report attempt to download playlist page with given number."""
909                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
910
911         def _real_initialize(self):
912                 self._youtube_ie.initialize()
913         
914         def _real_extract(self, query):
915                 mobj = re.match(self._VALID_QUERY, query)
916                 if mobj is None:
917                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
918                         return
919
920                 prefix, query = query.split(':')
921                 prefix = prefix[8:]
922                 if prefix == '':
923                         self._download_n_results(query, 1)
924                         return
925                 elif prefix == 'all':
926                         self._download_n_results(query, self._max_youtube_results)
927                         return
928                 else:
929                         try:
930                                 n = long(prefix)
931                                 if n <= 0:
932                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
933                                         return
934                                 elif n > self._max_youtube_results:
935                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
936                                         n = self._max_youtube_results
937                                 self._download_n_results(query, n)
938                                 return
939                         except ValueError: # parsing prefix as integer fails
940                                 self._download_n_results(query, 1)
941                                 return
942
943         def _download_n_results(self, query, n):
944                 """Downloads a specified number of results for a query"""
945
946                 video_ids = []
947                 already_seen = set()
948                 pagenum = 1
949
950                 while True:
951                         self.report_download_page(query, pagenum)
952                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
953                         request = urllib2.Request(result_url, None, std_headers)
954                         try:
955                                 page = urllib2.urlopen(request).read()
956                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
957                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
958                                 return
959
960                         # Extract video identifiers
961                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
962                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
963                                 if video_id not in already_seen:
964                                         video_ids.append(video_id)
965                                         already_seen.add(video_id)
966                                         if len(video_ids) == n:
967                                                 # Specified n videos reached
968                                                 for id in video_ids:
969                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
970                                                 return
971
972                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
973                                 for id in video_ids:
974                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
975                                 return
976
977                         pagenum = pagenum + 1
978
979 class YoutubePlaylistIE(InfoExtractor):
980         """Information Extractor for YouTube playlists."""
981
982         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
983         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
984         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
985         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
986         _youtube_ie = None
987
988         def __init__(self, youtube_ie, downloader=None):
989                 InfoExtractor.__init__(self, downloader)
990                 self._youtube_ie = youtube_ie
991         
992         @staticmethod
993         def suitable(url):
994                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
995
996         def report_download_page(self, playlist_id, pagenum):
997                 """Report attempt to download playlist page with given number."""
998                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
999
1000         def _real_initialize(self):
1001                 self._youtube_ie.initialize()
1002         
1003         def _real_extract(self, url):
1004                 # Extract playlist id
1005                 mobj = re.match(self._VALID_URL, url)
1006                 if mobj is None:
1007                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1008                         return
1009
1010                 # Download playlist pages
1011                 playlist_id = mobj.group(1)
1012                 video_ids = []
1013                 pagenum = 1
1014
1015                 while True:
1016                         self.report_download_page(playlist_id, pagenum)
1017                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1018                         try:
1019                                 page = urllib2.urlopen(request).read()
1020                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1021                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1022                                 return
1023
1024                         # Extract video identifiers
1025                         ids_in_page = []
1026                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1027                                 if mobj.group(1) not in ids_in_page:
1028                                         ids_in_page.append(mobj.group(1))
1029                         video_ids.extend(ids_in_page)
1030
1031                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1032                                 break
1033                         pagenum = pagenum + 1
1034
1035                 for id in video_ids:
1036                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1037                 return
1038
class PostProcessor(object):
        """Base class for post processors.

        A downloader keeps a chain of PostProcessor objects, registered
        through its add_post_processor() method. After a successful download
        it calls run() on each of them in turn: the first one receives the
        initial argument and every other one receives whatever the previous
        PostProcessor returned.

        The chain ends when its last element has run or as soon as one of
        the elements returns None.

        Like InfoExtractor objects, PostProcessor objects take part in a
        "mutual registration" process with the downloader.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Sets the downloader for this PP."""
                self._downloader = downloader

        def run(self, information):
                """Run the PostProcessor.

                "information" is a dictionary like the ones built by the
                InfoExtractors, with one extra field named "filepath" that
                points to the downloaded file.

                Returning None stops the postprocessing chain. Returning an
                information dictionary, either the received one or a modified
                version of it, passes that dictionary on to the next object in
                the chain.

                A PostProcessingError exception may also be raised. The
                downloader that called this method will take it into account.
                """
                return information # by default, do nothing
1084         
1085 ### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                # One opener handles both proxies and cookies. Installing two openers
                # in a row would only leave the last one in place.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='INTERNAL',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                # All the format options write to the same destination: the last one given wins
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchfd = open(opts.batchfile, 'r')
                                try:
                                        batchurls = batchfd.readlines()
                                finally:
                                        # Release the file handle even if reading fails
                                        batchfd.close()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if len(all_urls) < 1:
                        parser.error(u'you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
                        # The first alternative that yields a non-empty template is used
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'continuedl': opts.continue_dl,
                        })
                # The extractors are registered from the most specific to the most generic
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')