Added -c option (--continue)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # License: Public domain code
6 import htmlentitydefs
7 import httplib
8 import locale
9 import math
10 import netrc
11 import os
12 import os.path
13 import re
14 import socket
15 import string
16 import sys
17 import time
18 import urllib
19 import urllib2
20
21 std_headers = {
22         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.8',
23         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
25         'Accept-Language': 'en-us,en;q=0.5',
26 }
27
28 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
30 class DownloadError(Exception):
31         """Download Error exception.
32         
33         This exception may be thrown by FileDownloader objects if they are not
34         configured to continue on errors. They will contain the appropriate
35         error message.
36         """
37         pass
38
39 class SameFileError(Exception):
40         """Same File exception.
41
42         This exception will be thrown by FileDownloader objects if they detect
43         multiple files would have to be downloaded to the same file on disk.
44         """
45         pass
46
47 class PostProcessingError(Exception):
48         """Post Processing exception.
49
50         This exception may be raised by PostProcessor's .run() method to
51         indicate an error in the postprocessing task.
52         """
53         pass
54
55 class UnavailableFormatError(Exception):
56         """Unavailable Format exception.
57
58         This exception will be thrown when a video is requested
59         in a format that is not available for that video.
60         """
61         pass
62
63 class ContentTooShortError(Exception):
64         """Content Too Short exception.
65
66         This exception may be raised by FileDownloader objects when a file they
67         download is smaller than what the server announced, indicating that
68         the connection was probably interrupted.
69         """
70         # Both in bytes
71         downloaded = None
72         expected = None
73
74         def __init__(self, downloaded, expected):
75                 self.downloaded = downloaded
76                 self.expected = expected
77
78 class FileDownloader(object):
79         """File Downloader class.
80
81         File downloader objects are responsible for downloading the actual
82         video file and writing it to disk if the user has requested it,
83         among other tasks. In most cases there should be one per program.
84         Given a video URL, the downloader does not know how to extract all
85         the needed information; that is the task of the InfoExtractors, so
86         it has to pass the URL to one of them.
87
88         For this, file downloader objects have a method that allows
89         InfoExtractors to be registered in a given order. When it is passed
90         a URL, the file downloader hands it to the first InfoExtractor it
91         finds that reports being able to handle it. The InfoExtractor extracts
92         all the information about the video or videos the URL refers to, and
93         asks the FileDownloader to process the video information, possibly
94         downloading the video.
95
96         File downloaders accept a lot of parameters. In order not to saturate
97         the object constructor with arguments, it receives a dictionary of
98         options instead. These options are available through the params
99         attribute for the InfoExtractors to use. The FileDownloader also
100         registers itself as the downloader in charge for the InfoExtractors
101         that are added to it, so this is a "mutual registration".
102
103         Available options:
104
105         username:       Username for authentication purposes.
106         password:       Password for authentication purposes.
107         usenetrc:       Use netrc for authentication instead.
108         quiet:          Do not print messages to stdout.
109         forceurl:       Force printing final URL.
110         forcetitle:     Force printing title.
111         simulate:       Do not download the video files.
112         format:         Video format code.
113         outtmpl:        Template for output names.
114         ignoreerrors:   Do not stop on download errors.
115         ratelimit:      Download speed limit, in bytes/sec.
116         nooverwrites:   Prevent overwriting files.
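            continue:       Resume partially downloaded files.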
117         """
118
119         params = None
120         _ies = []
121         _pps = []
122         _download_retcode = None
123
124         def __init__(self, params):
125                 """Create a FileDownloader object with the given options."""
126                 self._ies = []
127                 self._pps = []
128                 self._download_retcode = 0
129                 self.params = params
130         
131         @staticmethod
132         def pmkdir(filename):
133                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
134                 components = filename.split(os.sep)
135                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
136                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
137                 for dir in aggregate:
138                         if not os.path.exists(dir):
139                                 os.mkdir(dir)
140         
141         @staticmethod
142         def format_bytes(bytes):
143                 if bytes is None:
144                         return 'N/A'
145                 if bytes == 0:
146                         exponent = 0
147                 else:
148                         exponent = long(math.log(float(bytes), 1024.0))
149                 suffix = 'bkMGTPEZY'[exponent]
150                 converted = float(bytes) / float(1024**exponent)
151                 return '%.2f%s' % (converted, suffix)
152
153         @staticmethod
154         def calc_percent(byte_counter, data_len):
155                 if data_len is None:
156                         return '---.-%'
157                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
158
159         @staticmethod
160         def calc_eta(start, now, total, current):
161                 if total is None:
162                         return '--:--'
163                 dif = now - start
164                 if current == 0 or dif < 0.001: # One millisecond
165                         return '--:--'
166                 rate = float(current) / dif
167                 eta = long((float(total) - float(current)) / rate)
168                 (eta_mins, eta_secs) = divmod(eta, 60)
169                 if eta_mins > 99:
170                         return '--:--'
171                 return '%02d:%02d' % (eta_mins, eta_secs)
172
173         @staticmethod
174         def calc_speed(start, now, bytes):
175                 dif = now - start
176                 if bytes == 0 or dif < 0.001: # One millisecond
177                         return '%10s' % '---b/s'
178                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
179
180         @staticmethod
181         def best_block_size(elapsed_time, bytes):
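                    # Pick the next read size from the measured rate of the last block,
                    # clamped between half and twice its size and capped at 4 MB.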
182                 new_min = max(bytes / 2.0, 1.0)
183                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
184                 if elapsed_time < 0.001:
185                         return int(new_max)
186                 rate = bytes / elapsed_time
187                 if rate > new_max:
188                         return int(new_max)
189                 if rate < new_min:
190                         return int(new_min)
191                 return int(rate)
192
193         @staticmethod
194         def parse_bytes(bytestr):
195                 """Parse a string indicating a byte quantity into a long integer."""
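                    # For example, '50k' parses to 51200 and '44.6m' to 46766490,
                    # matching the forms accepted by the --rate-limit option.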
196                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
197                 if matchobj is None:
198                         return None
199                 number = float(matchobj.group(1))
200                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
201                 return long(round(number * multiplier))
202
203         @staticmethod
204         def verify_url(url):
205                 """Verify a URL is valid and data could be downloaded."""
206                 request = urllib2.Request(url, None, std_headers)
207                 data = urllib2.urlopen(request)
208                 data.read(1)
209                 data.close()
210
211         def add_info_extractor(self, ie):
212                 """Add an InfoExtractor object to the end of the list."""
213                 self._ies.append(ie)
214                 ie.set_downloader(self)
215         
216         def add_post_processor(self, pp):
217                 """Add a PostProcessor object to the end of the chain."""
218                 self._pps.append(pp)
219                 pp.set_downloader(self)
220         
221         def to_stdout(self, message, skip_eol=False):
222                 """Print message to stdout if not in quiet mode."""
223                 if not self.params.get('quiet', False):
224                         print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(locale.getpreferredencoding()),
225                         sys.stdout.flush()
226         
227         def to_stderr(self, message):
228                 """Print message to stderr."""
229                 print >>sys.stderr, message
230         
231         def fixed_template(self):
232                 """Checks if the output template is fixed."""
233                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
234
235         def trouble(self, message=None):
236                 """Determine action to take when a download problem appears.
237
238                 Depending on if the downloader has been configured to ignore
239                 download errors or not, this method may throw an exception or
240                 not when errors are found, after printing the message.
241                 """
242                 if message is not None:
243                         self.to_stderr(message)
244                 if not self.params.get('ignoreerrors', False):
245                         raise DownloadError(message)
246                 self._download_retcode = 1
247
248         def slow_down(self, start_time, byte_counter):
249                 """Sleep if the download speed is over the rate limit."""
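                    # Sleep just long enough for the average speed since start_time
                    # to fall back to the configured 'ratelimit'.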
250                 rate_limit = self.params.get('ratelimit', None)
251                 if rate_limit is None or byte_counter == 0:
252                         return
253                 now = time.time()
254                 elapsed = now - start_time
255                 if elapsed <= 0.0:
256                         return
257                 speed = float(byte_counter) / elapsed
258                 if speed > rate_limit:
259                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
260
261         def report_destination(self, filename):
262                 """Report destination filename."""
263                 self.to_stdout(u'[download] Destination: %s' % filename)
264         
265         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
266                 """Report download progress."""
267                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
268                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
269         
270         def report_finish(self):
271                 """Report download finished."""
272                 self.to_stdout(u'')
273
274         def process_info(self, info_dict):
275                 """Process a single dictionary returned by an InfoExtractor."""
276                 # Do nothing else if in simulate mode
277                 if self.params.get('simulate', False):
278                         try:
279                                 self.verify_url(info_dict['url'])
280                         except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
281                                 raise UnavailableFormatError
282
283                         # Forced printings
284                         if self.params.get('forcetitle', False):
285                                 print info_dict['title'].encode(locale.getpreferredencoding())
286                         if self.params.get('forceurl', False):
287                                 print info_dict['url'].encode(locale.getpreferredencoding())
288
289                         return
290                         
291                 try:
292                         template_dict = dict(info_dict)
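                            # Expose the current Unix time as %(epoch)s in output templates.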
293                         template_dict['epoch'] = unicode(long(time.time()))
294                         filename = self.params['outtmpl'] % template_dict
295                         self.report_destination(filename)
296                 except (ValueError, KeyError), err:
297                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                            return
298                 if self.params['nooverwrites'] and os.path.exists(filename):
299                         self.to_stderr('WARNING: file exists: %s; skipping' % filename)
300                         return
301
302                 try:
303                         self.pmkdir(filename)
304                 except (OSError, IOError), err:
305                         self.trouble('ERROR: unable to create directories: %s' % str(err))
306                         return
307
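                    # The file is opened in append mode so that a partial download
                    # left on disk can be resumed with --continue (see _do_download).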
308                 try:
309                         outstream = open(filename, 'ab')
310                 except (OSError, IOError), err:
311                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
312                         return
313
314                 try:
315                         self._do_download(outstream, info_dict['url'])
316                         outstream.close()
317                 except (OSError, IOError), err:
318                         outstream.close()
319                         os.remove(filename)
320                         raise UnavailableFormatError
321                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
322                         self.trouble('ERROR: unable to download video data: %s' % str(err))
323                         return
324                 except (ContentTooShortError, ), err:
325                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
326                         return
327
328                 try:
329                         self.post_process(filename, info_dict)
330                 except (PostProcessingError), err:
331                         self.trouble('ERROR: postprocessing: %s' % str(err))
332                         return
333
334         def download(self, url_list):
335                 """Download a given list of URLs."""
336                 if len(url_list) > 1 and self.fixed_template():
337                         raise SameFileError(self.params['outtmpl'])
338
339                 for url in url_list:
340                         suitable_found = False
341                         for ie in self._ies:
342                                 # Go to next InfoExtractor if not suitable
343                                 if not ie.suitable(url):
344                                         continue
345
346                                 # Suitable InfoExtractor found
347                                 suitable_found = True
348
349                                 # Extract information from URL and process it
350                                 ie.extract(url)
351
352                                 # Suitable InfoExtractor had been found; go to next URL
353                                 break
354
355                         if not suitable_found:
356                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
357
358                 return self._download_retcode
359
360         def post_process(self, filename, ie_info):
361                 """Run the postprocessing chain on the given file."""
362                 info = dict(ie_info)
363                 info['filepath'] = filename
364                 for pp in self._pps:
365                         info = pp.run(info)
366                         if info is None:
367                                 break
368         
369         def _do_download(self, stream, url):
370                 request = urllib2.Request(url, None, std_headers)
371                 # Resume transfer if filesize is non-zero
372                 resume_len = stream.tell()
373                 if self.params.get('continue', False) and resume_len != 0:
374                         self.to_stdout(u'[download] Resuming download at byte %d' % resume_len)
375                         request.add_header('Range', 'bytes=%d-' % resume_len)
376                 else:
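                            # Not resuming: reopen the file truncated and download from scratch.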
377                         stream.close()
378                         stream = open(stream.name,'wb')
379                 try:
380                         data = urllib2.urlopen(request)
381                 except urllib2.HTTPError, e:
382                         if not e.code == 416: #  416 is 'Requested range not satisfiable'
383                                 raise
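                            # The Range request was rejected; fetch the URL without it to check
                            # whether the file on disk is already complete.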
384                         data = urllib2.urlopen(url)
385                         if int(data.info()['Content-Length']) == resume_len:
386                                 self.to_stdout(u'[download] %s has already been downloaded' % stream.name)
387                                 return
388                         else:
389                                 self.to_stdout(u'[download] Unable to resume; restarting download from the beginning')
390                                 stream.close()
391                                 stream = open(stream.name,'wb')
392                 data_len = data.info().get('Content-length', None)
393                 data_len_str = self.format_bytes(data_len)
394                 byte_counter = 0
395                 block_size = 1024
396                 start = time.time()
397                 while True:
398                         # Progress message
399                         percent_str = self.calc_percent(byte_counter, data_len)
400                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
401                         speed_str = self.calc_speed(start, time.time(), byte_counter)
402                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
403
404                         # Download and write
405                         before = time.time()
406                         data_block = data.read(block_size)
407                         after = time.time()
408                         data_block_len = len(data_block)
409                         if data_block_len == 0:
410                                 break
411                         byte_counter += data_block_len
412                         stream.write(data_block)
413                         block_size = self.best_block_size(after - before, data_block_len)
414
415                         # Apply rate limit
416                         self.slow_down(start, byte_counter)
417
418                 self.report_finish()
419                 if data_len is not None and str(byte_counter) != data_len:
420                         raise ContentTooShortError(byte_counter, long(data_len))
421
422 class InfoExtractor(object):
423         """Information Extractor class.
424
425         Information extractors are the classes that, given a URL, extract
426         information from the video (or videos) the URL refers to. This
427         information includes the real video URL, the video title and simplified
428         title, author and others. The information is stored in a dictionary
429         which is then passed to the FileDownloader. The FileDownloader
430         processes this information, possibly downloading the video to the file
431         system, among other possible outcomes. The dictionaries must include
432         the following fields:
433
434         id:             Video identifier.
435         url:            Final video URL.
436         uploader:       Nickname of the video uploader.
437         title:          Literal title.
438         stitle:         Simplified title.
439         ext:            Video filename extension.
440
441         Subclasses of this one should re-define the _real_initialize() and
442         _real_extract() methods, as well as the suitable() static method.
443         Probably, they should also be instantiated and added to the main
444         downloader.
445         """
446
447         _ready = False
448         _downloader = None
449
450         def __init__(self, downloader=None):
451                 """Constructor. Receives an optional downloader."""
452                 self._ready = False
453                 self.set_downloader(downloader)
454
455         @staticmethod
456         def suitable(url):
457                 """Receives a URL and returns True if suitable for this IE."""
458                 return False
459
460         def initialize(self):
461                 """Initializes an instance (authentication, etc)."""
462                 if not self._ready:
463                         self._real_initialize()
464                         self._ready = True
465
466         def extract(self, url):
467                 """Extracts URL information and returns it in list of dicts."""
468                 self.initialize()
469                 return self._real_extract(url)
470
471         def set_downloader(self, downloader):
472                 """Sets the downloader for this IE."""
473                 self._downloader = downloader
474         
475         def _real_initialize(self):
476                 """Real initialization process. Redefine in subclasses."""
477                 pass
478
479         def _real_extract(self, url):
480                 """Real extraction process. Redefine in subclasses."""
481                 pass
482
483 class YoutubeIE(InfoExtractor):
484         """Information extractor for youtube.com."""
485
486         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
487         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
488         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
489         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
490         _NETRC_MACHINE = 'youtube'
491         _available_formats = ['22', '35', '18', '17', '13'] # listed in order of priority for -b flag
492         _video_extensions = {
493                 '13': '3gp',
494                 '17': 'mp4',
495                 '18': 'mp4',
496                 '22': 'mp4',
497         }
498
499         @staticmethod
500         def suitable(url):
501                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
502
503         @staticmethod
504         def htmlentity_transform(matchobj):
505                 """Transforms an HTML entity to a Unicode character."""
506                 entity = matchobj.group(1)
507
508                 # Known non-numeric HTML entity
509                 if entity in htmlentitydefs.name2codepoint:
510                         return unichr(htmlentitydefs.name2codepoint[entity])
511
512                 # Unicode character
513                 mobj = re.match(ur'(?u)#(x?\d+)', entity)
514                 if mobj is not None:
515                         numstr = mobj.group(1)
516                         if numstr.startswith(u'x'):
517                                 base = 16
518                                 numstr = u'0%s' % numstr
519                         else:
520                                 base = 10
521                         return unichr(long(numstr, base))
522
523                 # Unknown entity in name, return its literal representation
524                 return (u'&%s;' % entity)
525
526         def report_lang(self):
527                 """Report attempt to set language."""
528                 self._downloader.to_stdout(u'[youtube] Setting language')
529
530         def report_login(self):
531                 """Report attempt to log in."""
532                 self._downloader.to_stdout(u'[youtube] Logging in')
533         
534         def report_age_confirmation(self):
535                 """Report attempt to confirm age."""
536                 self._downloader.to_stdout(u'[youtube] Confirming age')
537         
538         def report_webpage_download(self, video_id):
539                 """Report attempt to download webpage."""
540                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
541         
542         def report_information_extraction(self, video_id):
543                 """Report attempt to extract video information."""
544                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
545         
546         def report_video_url(self, video_id, video_real_url):
547                 """Report extracted video URL."""
548                 self._downloader.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
549         
550         def report_unavailable_format(self, video_id, format):
551                 """Report that the requested format is not available."""
552                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
553         
554         def _real_initialize(self):
555                 if self._downloader is None:
556                         return
557
558                 username = None
559                 password = None
560                 downloader_params = self._downloader.params
561
562                 # Attempt to use provided username and password or .netrc data
563                 if downloader_params.get('username', None) is not None:
564                         username = downloader_params['username']
565                         password = downloader_params['password']
566                 elif downloader_params.get('usenetrc', False):
567                         try:
568                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
569                                 if info is not None:
570                                         username = info[0]
571                                         password = info[2]
572                                 else:
573                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
574                         except (IOError, netrc.NetrcParseError), err:
575                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
576                                 return
577
578                 # Set language
579                 request = urllib2.Request(self._LANG_URL, None, std_headers)
580                 try:
581                         self.report_lang()
582                         urllib2.urlopen(request).read()
583                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
584                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
585                         return
586
587                 # No authentication to be performed
588                 if username is None:
589                         return
590
591                 # Log in
592                 login_form = {
593                                 'current_form': 'loginForm',
594                                 'next':         '/',
595                                 'action_login': 'Log In',
596                                 'username':     username,
597                                 'password':     password,
598                                 }
599                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
600                 try:
601                         self.report_login()
602                         login_results = urllib2.urlopen(request).read()
603                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
604                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
605                                 return
606                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
607                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
608                         return
609         
610                 # Confirm age
611                 age_form = {
612                                 'next_url':             '/',
613                                 'action_confirm':       'Confirm',
614                                 }
615                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
616                 try:
617                         self.report_age_confirmation()
618                         age_results = urllib2.urlopen(request).read()
619                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
620                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
621                         return
622
623         def _real_extract(self, url):
624                 # Extract video id from URL
625                 mobj = re.match(self._VALID_URL, url)
626                 if mobj is None:
627                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
628                         return
629                 video_id = mobj.group(2)
630
631                 # Downloader parameters
632                 best_quality = False
633                 format_param = None
634                 quality_index = 0
635                 if self._downloader is not None:
636                         params = self._downloader.params
637                         format_param = params.get('format', None)
638                         if format_param == '0':
639                                 format_param = self._available_formats[quality_index]
640                                 best_quality = True
641
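                    # With -b (format '0'), the loop retries with the next entry in
                    # _available_formats whenever the current format is unavailable.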
642                 while True:
643                         # Extension
644                         video_extension = self._video_extensions.get(format_param, 'flv')
645
646                         # Normalize URL, including format
647                         normalized_url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
648                         if format_param is not None:
649                                 normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
650                         request = urllib2.Request(normalized_url, None, std_headers)
651                         try:
652                                 self.report_webpage_download(video_id)
653                                 video_webpage = urllib2.urlopen(request).read()
654                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
655                                 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
656                                 return
657                         self.report_information_extraction(video_id)
658                         
659                         # "t" param
660                         mobj = re.search(r', "t": "([^"]+)"', video_webpage)
661                         if mobj is None:
662                                 self._downloader.trouble(u'ERROR: unable to extract "t" parameter')
663                                 return
664                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&el=detailpage&ps=' % (video_id, mobj.group(1))
665                         if format_param is not None:
666                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
667                         self.report_video_url(video_id, video_real_url)
668
669                         # uploader
670                         mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
671                         if mobj is None:
672                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
673                                 return
674                         video_uploader = mobj.group(1)
675
676                         # title
677                         mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
678                         if mobj is None:
679                                 self._downloader.trouble(u'ERROR: unable to extract video title')
680                                 return
681                         video_title = mobj.group(1).decode('utf-8')
682                         video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
683                         video_title = video_title.replace(os.sep, u'%')
684
685                         # simplified title
686                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
687                         simple_title = simple_title.strip(ur'_')
688
689                         try:
690                                 # Process video information
691                                 self._downloader.process_info({
692                                         'id':           video_id.decode('utf-8'),
693                                         'url':          video_real_url.decode('utf-8'),
694                                         'uploader':     video_uploader.decode('utf-8'),
695                                         'title':        video_title,
696                                         'stitle':       simple_title,
697                                         'ext':          video_extension.decode('utf-8'),
698                                 })
699
700                                 return
701
702                         except UnavailableFormatError, err:
703                                 if best_quality:
704                                         if quality_index == len(self._available_formats) - 1:
705                                                 # I don't ever expect this to happen
706                                                 self._downloader.trouble(u'ERROR: no known formats available for video')
707                                                 return
708                                         else:
709                                                 self.report_unavailable_format(video_id, format_param)
710                                                 quality_index += 1
711                                                 format_param = self._available_formats[quality_index]
712                                                 continue
713                                 else: 
714                                         self._downloader.trouble('ERROR: format not available for video')
715                                         return
716
717
718 class MetacafeIE(InfoExtractor):
719         """Information Extractor for metacafe.com."""
720
721         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
722         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
723         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
724         _youtube_ie = None
725
726         def __init__(self, youtube_ie, downloader=None):
727                 InfoExtractor.__init__(self, downloader)
728                 self._youtube_ie = youtube_ie
729
730         @staticmethod
731         def suitable(url):
732                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
733
734         def report_disclaimer(self):
735                 """Report disclaimer retrieval."""
736                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
737
738         def report_age_confirmation(self):
739                 """Report attempt to confirm age."""
740                 self._downloader.to_stdout(u'[metacafe] Confirming age')
741         
742         def report_download_webpage(self, video_id):
743                 """Report webpage download."""
744                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
745         
746         def report_extraction(self, video_id):
747                 """Report information extraction."""
748                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
749
750         def _real_initialize(self):
751                 # Retrieve disclaimer
752                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
753                 try:
754                         self.report_disclaimer()
755                         disclaimer = urllib2.urlopen(request).read()
756                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
757                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
758                         return
759
760                 # Confirm age
761                 disclaimer_form = {
762                         'filters': '0',
763                         'submit': "Continue - I'm over 18",
764                         }
765                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
766                 try:
767                         self.report_age_confirmation()
768                         disclaimer = urllib2.urlopen(request).read()
769                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
770                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
771                         return
772         
773         def _real_extract(self, url):
774                 # Extract id and simplified title from URL
775                 mobj = re.match(self._VALID_URL, url)
776                 if mobj is None:
777                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
778                         return
779
780                 video_id = mobj.group(1)
781
782                 # Check if video comes from YouTube
783                 mobj2 = re.match(r'^yt-(.*)$', video_id)
784                 if mobj2 is not None:
785                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
786                         return
787
788                 simple_title = mobj.group(2).decode('utf-8')
789                 video_extension = 'flv'
790
791                 # Retrieve video webpage to extract further information
792                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
793                 try:
794                         self.report_download_webpage(video_id)
795                         webpage = urllib2.urlopen(request).read()
796                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
797                         self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
798                         return
799
800                 # Extract URL, uploader and title from webpage
801                 self.report_extraction(video_id)
802                 mobj = re.search(r'(?m)&mediaURL=(http.*?\.flv)', webpage)
803                 if mobj is None:
804                         self._downloader.trouble(u'ERROR: unable to extract media URL')
805                         return
806                 mediaURL = urllib.unquote(mobj.group(1))
807
808                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
809                 if mobj is None:
810                         self._downloader.trouble(u'ERROR: unable to extract gdaKey')
811                         return
812                 gdaKey = mobj.group(1)
813
814                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
815
816                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
817                 if mobj is None:
818                         self._downloader.trouble(u'ERROR: unable to extract title')
819                         return
820                 video_title = mobj.group(1).decode('utf-8')
821
822                 mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
823                 if mobj is None:
824                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
825                         return
826                 video_uploader = mobj.group(1)
827
828                 try:
829                         # Process video information
830                         self._downloader.process_info({
831                                 'id':           video_id.decode('utf-8'),
832                                 'url':          video_url.decode('utf-8'),
833                                 'uploader':     video_uploader.decode('utf-8'),
834                                 'title':        video_title,
835                                 'stitle':       simple_title,
836                                 'ext':          video_extension.decode('utf-8'),
837                         })
838                 except UnavailableFormatError:
839                         self._downloader.trouble(u'ERROR: format not available for video')
840
841
842 class YoutubeSearchIE(InfoExtractor):
843         """Information Extractor for YouTube search queries."""
844         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
845         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
846         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
847         _MORE_PAGES_INDICATOR = r'>Next</a>'
848         _youtube_ie = None
849         _max_youtube_results = 1000
850
851         def __init__(self, youtube_ie, downloader=None):
852                 InfoExtractor.__init__(self, downloader)
853                 self._youtube_ie = youtube_ie
854         
855         @staticmethod
856         def suitable(url):
857                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
858
859         def report_download_page(self, query, pagenum):
860                 """Report attempt to download search results page with given number."""
861                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
862
863         def _real_initialize(self):
864                 self._youtube_ie.initialize()
865         
866         def _real_extract(self, query):
867                 mobj = re.match(self._VALID_QUERY, query)
868                 if mobj is None:
869                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
870                         return
871
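                    # The text between 'ytsearch' and ':' selects how many results to
                    # fetch: empty means 1, 'all' means the maximum, else a number.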
872                 prefix, query = query.split(':', 1)
873                 prefix = prefix[8:]
874                 if prefix == '':
875                         self._download_n_results(query, 1)
876                         return
877                 elif prefix == 'all':
878                         self._download_n_results(query, self._max_youtube_results)
879                         return
880                 else:
881                         try:
882                                 n = int(prefix)
883                                 if n <= 0:
884                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
885                                         return
886                                 elif n > self._max_youtube_results:
887                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
888                                         n = self._max_youtube_results
889                                 self._download_n_results(query, n)
890                                 return
891                         except ValueError: # parsing prefix as int fails
892                                 self._download_n_results(query, 1)
893                                 return
894
895         def _download_n_results(self, query, n):
896                 """Downloads a specified number of results for a query"""
897
898                 video_ids = []
899                 already_seen = set()
900                 pagenum = 1
901
902                 while True:
903                         self.report_download_page(query, pagenum)
904                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
905                         request = urllib2.Request(result_url, None, std_headers)
906                         try:
907                                 page = urllib2.urlopen(request).read()
908                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
909                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
910                                 return
911
912                         # Extract video identifiers
913                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
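                                    # Each match looks like href="/watch?v=ID"; keep the text
                                    # after the second '=' and drop the trailing quote.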
914                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
915                                 if video_id not in already_seen:
916                                         video_ids.append(video_id)
917                                         already_seen.add(video_id)
918                                         if len(video_ids) == n:
919                                                 # Specified n videos reached
920                                                 for id in video_ids:
921                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
922                                                 return
923
924                         if self._MORE_PAGES_INDICATOR not in page:
925                                 for id in video_ids:
926                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
927                                 return
928
929                         pagenum = pagenum + 1
930
931 class YoutubePlaylistIE(InfoExtractor):
932         """Information Extractor for YouTube playlists."""
933
934         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/view_play_list\?p=(.+)'
935         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
936         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
937         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
938         _youtube_ie = None
939
940         def __init__(self, youtube_ie, downloader=None):
941                 InfoExtractor.__init__(self, downloader)
942                 self._youtube_ie = youtube_ie
943         
944         @staticmethod
945         def suitable(url):
946                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
947
948         def report_download_page(self, playlist_id, pagenum):
949                 """Report attempt to download playlist page with given number."""
950                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
951
952         def _real_initialize(self):
953                 self._youtube_ie.initialize()
954         
955         def _real_extract(self, url):
956                 # Extract playlist id
957                 mobj = re.match(self._VALID_URL, url)
958                 if mobj is None:
959                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
960                         return
961
962                 # Download playlist pages
963                 playlist_id = mobj.group(1)
964                 video_ids = []
965                 pagenum = 1
966
967                 while True:
968                         self.report_download_page(playlist_id, pagenum)
969                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
970                         try:
971                                 page = urllib2.urlopen(request).read()
972                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
973                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
974                                 return
975
976                         # Extract video identifiers
977                         ids_in_page = []
978                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
979                                 if mobj.group(1) not in ids_in_page:
980                                         ids_in_page.append(mobj.group(1))
981                         video_ids.extend(ids_in_page)
982
983                         if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
984                                 break
985                         pagenum = pagenum + 1
986
987                 for id in video_ids:
988                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
989                 return
990
991 class PostProcessor(object):
992         """Post Processor class.
993
994         PostProcessor objects can be added to downloaders with their
995         add_post_processor() method. When the downloader has finished a
996         successful download, it will take its internal chain of PostProcessors
997         and start calling the run() method on each one of them, first with
998         an initial argument and then with the returned value of the previous
999         PostProcessor.
1000
1001         The chain will be stopped if one of them ever returns None or the end
1002         of the chain is reached.
1003
1004         PostProcessor objects follow a "mutual registration" process similar
1005         to InfoExtractor objects.
1006         """
1007
1008         _downloader = None
1009
1010         def __init__(self, downloader=None):
1011                 self._downloader = downloader
1012
1013         def set_downloader(self, downloader):
1014                 """Sets the downloader for this PP."""
1015                 self._downloader = downloader
1016         
1017         def run(self, information):
1018                 """Run the PostProcessor.
1019
1020                 The "information" argument is a dictionary like the ones
1021                 composed by InfoExtractors. The only difference is that this
1022                 one has an extra field called "filepath" that points to the
1023                 downloaded file.
1024
1025                 When this method returns None, the postprocessing chain is
1026                 stopped. However, this method may return an information
1027                 dictionary that will be passed to the next postprocessing
1028                 object in the chain. It can be the one it received after
1029                 changing some fields.
1030
1031                 In addition, this method may raise a PostProcessingError
1032                 exception that will be taken into account by the downloader
1033                 it was called from.
1034                 """
1035                 return information # by default, do nothing
1036         
1037 ### MAIN PROGRAM ###
1038 if __name__ == '__main__':
1039         try:
1040                 # Modules needed only when running the main program
1041                 import getpass
1042                 import optparse
1043
1044                 # General configuration
1045                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
1047                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1048
1049                 # Parse command line
1050                 parser = optparse.OptionParser(
1051                         usage='Usage: %prog [options] url...',
1052                         version='INTERNAL',
1053                         conflict_handler='resolve',
1054                 )
1055
1056                 parser.add_option('-h', '--help',
1057                                 action='help', help='print this help text and exit')
1058                 parser.add_option('-v', '--version',
1059                                 action='version', help='print program version and exit')
1060                 parser.add_option('-i', '--ignore-errors',
1061                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1062                 parser.add_option('-r', '--rate-limit',
1063                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1064
1065                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1066                 authentication.add_option('-u', '--username',
1067                                 dest='username', metavar='UN', help='account username')
1068                 authentication.add_option('-p', '--password',
1069                                 dest='password', metavar='PW', help='account password')
1070                 authentication.add_option('-n', '--netrc',
1071                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1072                 parser.add_option_group(authentication)
1073
1074                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1075                 video_format.add_option('-f', '--format',
1076                                 action='store', dest='format', metavar='FMT', help='video format code')
1077                 video_format.add_option('-b', '--best-quality',
1078                                 action='store_const', dest='format', help='download the best quality video possible', const='0')
1079                 video_format.add_option('-m', '--mobile-version',
1080                                 action='store_const', dest='format', help='alias for -f 17', const='17')
1081                 video_format.add_option('-d', '--high-def',
1082                                 action='store_const', dest='format', help='alias for -f 22', const='22')
1083                 parser.add_option_group(video_format)
1084
1085                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1086                 verbosity.add_option('-q', '--quiet',
1087                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1088                 verbosity.add_option('-s', '--simulate',
1089                                 action='store_true', dest='simulate', help='do not download video', default=False)
1090                 verbosity.add_option('-g', '--get-url',
1091                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1092                 verbosity.add_option('-e', '--get-title',
1093                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1094                 parser.add_option_group(verbosity)
1095
1096                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1097                 filesystem.add_option('-t', '--title',
1098                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
1099                 filesystem.add_option('-l', '--literal',
1100                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1101                 filesystem.add_option('-o', '--output',
1102                                 dest='outtmpl', metavar='TPL', help='output filename template')
1103                 filesystem.add_option('-a', '--batch-file',
1104                                 dest='batchfile', metavar='F', help='file containing URLs to download')
1105                 filesystem.add_option('-w', '--no-overwrites',
1106                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1107                 filesystem.add_option('-c', '--continue',
1108                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
1109                 parser.add_option_group(filesystem)
1110
1111                 (opts, args) = parser.parse_args()
1112
1113                 # Batch file verification
1114                 batchurls = []
1115                 if opts.batchfile is not None:
1116                         try:
1117                                 batchurls = open(opts.batchfile, 'r').readlines()
1118                                 batchurls = [x.strip() for x in batchurls]
1119                                 batchurls = [x for x in batchurls if len(x) > 0]
1120                         except IOError:
1121                                 sys.exit(u'ERROR: batch file could not be read')
1122                 all_urls = batchurls + args
1123
1124                 # Conflicting, missing and erroneous options
1125                 if len(all_urls) < 1:
1126                         parser.error(u'you must provide at least one URL')
1127                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1128                         parser.error(u'using .netrc conflicts with giving username/password')
1129                 if opts.password is not None and opts.username is None:
1130                         parser.error(u'account username missing')
1131                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1132                         parser.error(u'using output template conflicts with using title or literal title')
1133                 if opts.usetitle and opts.useliteral:
1134                         parser.error(u'using title conflicts with using literal title')
1135                 if opts.username is not None and opts.password is None:
1136                         opts.password = getpass.getpass(u'Type account password and press return:')
1137                 if opts.ratelimit is not None:
1138                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1139                         if numeric_limit is None:
1140                                 parser.error(u'invalid rate limit specified')
1141                         opts.ratelimit = numeric_limit
1142
1143                 # Information extractors
1144                 youtube_ie = YoutubeIE()
1145                 metacafe_ie = MetacafeIE(youtube_ie)
1146                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1147                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1148
1149                 # File downloader
1150                 fd = FileDownloader({
1151                         'usenetrc': opts.usenetrc,
1152                         'username': opts.username,
1153                         'password': opts.password,
1154                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1155                         'forceurl': opts.geturl,
1156                         'forcetitle': opts.gettitle,
1157                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1158                         'format': opts.format,
1159                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(locale.getpreferredencoding()))
1160                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1161                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1162                                 or u'%(id)s.%(ext)s'),
1163                         'ignoreerrors': opts.ignoreerrors,
1164                         'ratelimit': opts.ratelimit,
1165                         'nooverwrites': opts.nooverwrites,
1166                         'continue': opts.continue_dl,
1167                         })
1168                 fd.add_info_extractor(youtube_search_ie)
1169                 fd.add_info_extractor(youtube_pl_ie)
1170                 fd.add_info_extractor(metacafe_ie)
1171                 fd.add_info_extractor(youtube_ie)
1172                 retcode = fd.download(all_urls)
1173                 sys.exit(retcode)
1174
1175         except DownloadError:
1176                 sys.exit(1)
1177         except SameFileError:
1178                 sys.exit(u'ERROR: fixed output name but more than one file to download')
1179         except KeyboardInterrupt:
1180                 sys.exit(u'\nERROR: Interrupted by user')