Give preference to format 34 before format 5 in quality list
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Headers sent with every HTTP request. The User-Agent imitates a real
# browser (Firefox 3.6) -- presumably so sites do not serve alternate
# content to unknown clients; TODO confirm against the target sites.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters kept by "simplified" titles: ASCII letters and digits, as unicode.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
38 def preferredencoding():
39         """Get preferred encoding.
40
41         Returns the best encoding scheme for the system, based on
42         locale.getpreferredencoding() and some further tweaks.
43         """
44         def yield_preferredencoding():
45                 try:
46                         pref = locale.getpreferredencoding()
47                         u'TEST'.encode(pref)
48                 except:
49                         pref = 'UTF-8'
50                 while True:
51                         yield pref
52         return yield_preferredencoding().next()
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 stream = open(filename, open_mode)
97                 return (stream, filename)
98         except (IOError, OSError), err:
99                 # In case of error, try to remove win32 forbidden chars
100                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
101
102                 # An exception here should be caught in the caller
103                 stream = open(filename, open_mode)
104                 return (stream, filename)
105
106
class DownloadError(Exception):
	"""Raised when downloading fails.

	FileDownloader objects throw this exception when they are not
	configured to continue on errors; it carries the relevant error
	message.
	"""
	pass
115
class SameFileError(Exception):
	"""Raised when several downloads would collide on one output file.

	FileDownloader objects throw this when they detect that multiple
	videos would be written to the same file on disk.
	"""
	pass
123
class PostProcessingError(Exception):
	"""Raised when a postprocessing step fails.

	A PostProcessor's .run() method may raise this to signal an error in
	the postprocessing task.
	"""
	pass
131
class UnavailableFormatError(Exception):
	"""Raised when a video is requested in a format it does not offer."""
	pass
139
class ContentTooShortError(Exception):
	"""Raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the file they downloaded is
	smaller than the size the server advertised first, which usually
	indicates an interrupted connection.
	"""
	# Byte counts: what actually arrived vs. what the server announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
154
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	continuedl:     Try to continue downloads if possible.
	"""

	params = None			# Options dictionary, also read by InfoExtractors
	_ies = []			# Registered InfoExtractor objects
	_pps = []			# Registered PostProcessor objects
	_download_retcode = None	# Returned by download(): 0 ok, 1 after any error

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build each ancestor path, shortest first, and create the missing ones.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string (e.g. '1.00k')."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate the remaining time as 'MM:SS'; '--:--' when unknown."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size based on how fast the last block arrived."""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix indexes position 0 ('b'), i.e. a multiplier of 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading a single byte forces the connection, surfacing errors early.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; one is
				# embedded in the message itself unless skip_eol is set.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average back under the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message if the name cannot be encoded.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		# NOTE(review): if the template failed above and 'ignoreerrors' is set,
		# trouble() returns and 'filename' is unbound here -- NameError.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			# Treated as "this format could not be fetched" rather than fatal.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			# A postprocessor returning None stops the chain.
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url):
		"""Download an RTMP URL with the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('ERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url):
		"""Download url to filename over HTTP (or hand off to rtmpdump).

		Returns True on success; may raise ContentTooShortError if the
		server served fewer bytes than announced.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: #  416 is 'Requested range not satisfiable'
				raise
			# Unable to resume; retry without the Range header to find out why.
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so a failed connection leaves no file behind
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the raw header string, so compare string representations.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
569
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor produces the data describing the
	video (or videos) that URL refers to: the real video URL, the title, a
	simplified title, the uploader and so on. The data is collected in a
	dictionary and handed to the FileDownloader, which may then download
	the video to disk, among other outcomes. Each dictionary must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.

	Subclasses should re-define the _real_initialize() and _real_extract()
	methods, as well as the suitable() static method. They are typically
	instantiated and registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader in charge of this extractor, if any.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc), at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
630
631 class YoutubeIE(InfoExtractor):
632         """Information extractor for youtube.com."""
633
634         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
635         _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
636         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
637         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
638         _NETRC_MACHINE = 'youtube'
639         _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
640         _video_extensions = {
641                 '13': '3gp',
642                 '17': 'mp4',
643                 '18': 'mp4',
644                 '22': 'mp4',
645                 '37': 'mp4',
646         }
647
648         @staticmethod
649         def suitable(url):
650                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
651
652         def report_lang(self):
653                 """Report attempt to set language."""
654                 self._downloader.to_stdout(u'[youtube] Setting language')
655
656         def report_login(self):
657                 """Report attempt to log in."""
658                 self._downloader.to_stdout(u'[youtube] Logging in')
659         
660         def report_age_confirmation(self):
661                 """Report attempt to confirm age."""
662                 self._downloader.to_stdout(u'[youtube] Confirming age')
663         
664         def report_video_info_webpage_download(self, video_id):
665                 """Report attempt to download video info webpage."""
666                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
667         
668         def report_information_extraction(self, video_id):
669                 """Report attempt to extract video information."""
670                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
671         
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
675         
676         def report_rtmp_download(self):
677                 """Indicate the download will use the RTMP protocol."""
678                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
679         
	def _real_initialize(self):
		"""Force the site language and optionally log in and confirm age.

		Credentials come from the downloader params or, with usenetrc,
		from ~/.netrc.  Language and login failures are soft: they are
		reported as warnings and the method returns early, since most
		videos can still be downloaded anonymously.  A failed age
		confirmation is reported through trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (response body is irrelevant; the request sets cookies)
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form reappearing in the response means the login failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
748
	def _real_extract(self, url):
		"""Extract video information for a YouTube URL and hand it off.

		When the requested format is '0' (-b, best quality), the formats
		in _available_formats are tried in priority order: each
		UnavailableFormatError moves quality_index to the next entry
		until one succeeds or the list is exhausted.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True

		# Each iteration tries one format; non-format errors return outright.
		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video info
			video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				self.report_video_info_webpage_download(video_id)
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw server response so users can attach it to bug reports.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = sanitize_title(video_title)

			# simplified title: runs of non-alphanumerics collapse to '_'
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
				})

				return

			except UnavailableFormatError, err:
				if best_quality:
					if quality_index == len(self._available_formats) - 1:
						# I don't ever expect this to happen
						self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						# -b: fall back to the next format in the priority list.
						self.report_unavailable_format(video_id, format_param)
						quality_index += 1
						format_param = self._available_formats[quality_index]
						continue
				else:
					# NOTE(review): plain str here, unlike the u'' literals elsewhere.
					self._downloader.trouble('ERROR: format not available for video')
					return
851
852
853 class MetacafeIE(InfoExtractor):
854         """Information Extractor for metacafe.com."""
855
856         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
857         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
858         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
859         _youtube_ie = None
860
861         def __init__(self, youtube_ie, downloader=None):
862                 InfoExtractor.__init__(self, downloader)
863                 self._youtube_ie = youtube_ie
864
865         @staticmethod
866         def suitable(url):
867                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
868
869         def report_disclaimer(self):
870                 """Report disclaimer retrieval."""
871                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
872
873         def report_age_confirmation(self):
874                 """Report attempt to confirm age."""
875                 self._downloader.to_stdout(u'[metacafe] Confirming age')
876         
877         def report_download_webpage(self, video_id):
878                 """Report webpage download."""
879                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
880         
881         def report_extraction(self, video_id):
882                 """Report information extraction."""
883                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
884
885         def _real_initialize(self):
886                 # Retrieve disclaimer
887                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
888                 try:
889                         self.report_disclaimer()
890                         disclaimer = urllib2.urlopen(request).read()
891                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
892                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
893                         return
894
895                 # Confirm age
896                 disclaimer_form = {
897                         'filters': '0',
898                         'submit': "Continue - I'm over 18",
899                         }
900                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
901                 try:
902                         self.report_age_confirmation()
903                         disclaimer = urllib2.urlopen(request).read()
904                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
905                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
906                         return
907         
908         def _real_extract(self, url):
909                 # Extract id and simplified title from URL
910                 mobj = re.match(self._VALID_URL, url)
911                 if mobj is None:
912                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
913                         return
914
915                 video_id = mobj.group(1)
916
917                 # Check if video comes from YouTube
918                 mobj2 = re.match(r'^yt-(.*)$', video_id)
919                 if mobj2 is not None:
920                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
921                         return
922
923                 simple_title = mobj.group(2).decode('utf-8')
924                 video_extension = 'flv'
925
926                 # Retrieve video webpage to extract further information
927                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
928                 try:
929                         self.report_download_webpage(video_id)
930                         webpage = urllib2.urlopen(request).read()
931                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
932                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
933                         return
934
935                 # Extract URL, uploader and title from webpage
936                 self.report_extraction(video_id)
937                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
938                 if mobj is None:
939                         self._downloader.trouble(u'ERROR: unable to extract media URL')
940                         return
941                 mediaURL = urllib.unquote(mobj.group(1))
942
943                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
944                 #if mobj is None:
945                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
946                 #       return
947                 #gdaKey = mobj.group(1)
948                 #
949                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
950
951                 video_url = mediaURL
952
953                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
954                 if mobj is None:
955                         self._downloader.trouble(u'ERROR: unable to extract title')
956                         return
957                 video_title = mobj.group(1).decode('utf-8')
958                 video_title = sanitize_title(video_title)
959
960                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
961                 if mobj is None:
962                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
963                         return
964                 video_uploader = mobj.group(1)
965
966                 try:
967                         # Process video information
968                         self._downloader.process_info({
969                                 'id':           video_id.decode('utf-8'),
970                                 'url':          video_url.decode('utf-8'),
971                                 'uploader':     video_uploader.decode('utf-8'),
972                                 'title':        video_title,
973                                 'stitle':       simple_title,
974                                 'ext':          video_extension.decode('utf-8'),
975                         })
976                 except UnavailableFormatError:
977                         self._downloader.trouble(u'ERROR: format not available for video')
978
979
980 class GoogleIE(InfoExtractor):
981         """Information extractor for video.google.com."""
982
983         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
984
985         def __init__(self, downloader=None):
986                 InfoExtractor.__init__(self, downloader)
987
988         @staticmethod
989         def suitable(url):
990                 return (re.match(GoogleIE._VALID_URL, url) is not None)
991
992         def report_download_webpage(self, video_id):
993                 """Report webpage download."""
994                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
995
996         def report_extraction(self, video_id):
997                 """Report information extraction."""
998                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
999
1000         def _real_initialize(self):
1001                 return
1002
1003         def _real_extract(self, url):
1004                 # Extract id from URL
1005                 mobj = re.match(self._VALID_URL, url)
1006                 if mobj is None:
1007                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1008                         return
1009
1010                 video_id = mobj.group(1)
1011
1012                 video_extension = 'mp4'
1013
1014                 # Retrieve video webpage to extract further information
1015                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1016                 try:
1017                         self.report_download_webpage(video_id)
1018                         webpage = urllib2.urlopen(request).read()
1019                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1020                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1021                         return
1022
1023                 # Extract URL, uploader, and title from webpage
1024                 self.report_extraction(video_id)
1025                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1026                 if mobj is None:
1027                         video_extension = 'flv'
1028                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1029                 if mobj is None:
1030                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1031                         return
1032                 mediaURL = urllib.unquote(mobj.group(1))
1033                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1034                 mediaURL = mediaURL.replace('\\x26', '\x26')
1035
1036                 video_url = mediaURL
1037
1038                 mobj = re.search(r'<title>(.*)</title>', webpage)
1039                 if mobj is None:
1040                         self._downloader.trouble(u'ERROR: unable to extract title')
1041                         return
1042                 video_title = mobj.group(1).decode('utf-8')
1043                 video_title = sanitize_title(video_title)
1044                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1045
1046                 # Google Video doesn't show uploader nicknames?
1047                 video_uploader = 'NA'
1048
1049                 try:
1050                         # Process video information
1051                         self._downloader.process_info({
1052                                 'id':           video_id.decode('utf-8'),
1053                                 'url':          video_url.decode('utf-8'),
1054                                 'uploader':     video_uploader.decode('utf-8'),
1055                                 'title':        video_title,
1056                                 'stitle':       simple_title,
1057                                 'ext':          video_extension.decode('utf-8'),
1058                         })
1059                 except UnavailableFormatError:
1060                         self._downloader.trouble(u'ERROR: format not available for video')
1061
1062
1063 class PhotobucketIE(InfoExtractor):
1064         """Information extractor for photobucket.com."""
1065
1066         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1067
1068         def __init__(self, downloader=None):
1069                 InfoExtractor.__init__(self, downloader)
1070
1071         @staticmethod
1072         def suitable(url):
1073                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1074
1075         def report_download_webpage(self, video_id):
1076                 """Report webpage download."""
1077                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1078
1079         def report_extraction(self, video_id):
1080                 """Report information extraction."""
1081                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1082
1083         def _real_initialize(self):
1084                 return
1085
1086         def _real_extract(self, url):
1087                 # Extract id from URL
1088                 mobj = re.match(self._VALID_URL, url)
1089                 if mobj is None:
1090                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1091                         return
1092
1093                 video_id = mobj.group(1)
1094
1095                 video_extension = 'flv'
1096
1097                 # Retrieve video webpage to extract further information
1098                 request = urllib2.Request(url)
1099                 try:
1100                         self.report_download_webpage(video_id)
1101                         webpage = urllib2.urlopen(request).read()
1102                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1103                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1104                         return
1105
1106                 # Extract URL, uploader, and title from webpage
1107                 self.report_extraction(video_id)
1108                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1109                 if mobj is None:
1110                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1111                         return
1112                 mediaURL = urllib.unquote(mobj.group(1))
1113
1114                 video_url = mediaURL
1115
1116                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1117                 if mobj is None:
1118                         self._downloader.trouble(u'ERROR: unable to extract title')
1119                         return
1120                 video_title = mobj.group(1).decode('utf-8')
1121                 video_title = sanitize_title(video_title)
1122                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1123
1124                 video_uploader = mobj.group(2).decode('utf-8')
1125
1126                 try:
1127                         # Process video information
1128                         self._downloader.process_info({
1129                                 'id':           video_id.decode('utf-8'),
1130                                 'url':          video_url.decode('utf-8'),
1131                                 'uploader':     video_uploader,
1132                                 'title':        video_title,
1133                                 'stitle':       simple_title,
1134                                 'ext':          video_extension.decode('utf-8'),
1135                         })
1136                 except UnavailableFormatError:
1137                         self._downloader.trouble(u'ERROR: format not available for video')
1138
1139
1140 class GenericIE(InfoExtractor):
1141         """Generic last-resort information extractor."""
1142
1143         def __init__(self, downloader=None):
1144                 InfoExtractor.__init__(self, downloader)
1145
1146         @staticmethod
1147         def suitable(url):
1148                 return True
1149
1150         def report_download_webpage(self, video_id):
1151                 """Report webpage download."""
1152                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1153                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1154
1155         def report_extraction(self, video_id):
1156                 """Report information extraction."""
1157                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1158
1159         def _real_initialize(self):
1160                 return
1161
1162         def _real_extract(self, url):
1163                 video_id = url.split('/')[-1]
1164                 request = urllib2.Request(url)
1165                 try:
1166                         self.report_download_webpage(video_id)
1167                         webpage = urllib2.urlopen(request).read()
1168                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1170                         return
1171                 except ValueError, err:
1172                         # since this is the last-resort InfoExtractor, if
1173                         # this error is thrown, it'll be thrown here
1174                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1175                         return
1176
1177                 # Start with something easy: JW Player in SWFObject
1178                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1179                 if mobj is None:
1180                         # Broaden the search a little bit
1181                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1182                 if mobj is None:
1183                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1184                         return
1185
1186                 # It's possible that one of the regexes
1187                 # matched, but returned an empty group:
1188                 if mobj.group(1) is None:
1189                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1190                         return
1191
1192                 video_url = urllib.unquote(mobj.group(1))
1193                 video_id  = os.path.basename(video_url)
1194
1195                 # here's a fun little line of code for you:
1196                 video_extension = os.path.splitext(video_id)[1][1:]
1197                 video_id        = os.path.splitext(video_id)[0]
1198
1199                 # it's tempting to parse this further, but you would
1200                 # have to take into account all the variations like
1201                 #   Video Title - Site Name
1202                 #   Site Name | Video Title
1203                 #   Video Title - Tagline | Site Name
1204                 # and so on and so forth; it's just not practical
1205                 mobj = re.search(r'<title>(.*)</title>', webpage)
1206                 if mobj is None:
1207                         self._downloader.trouble(u'ERROR: unable to extract title')
1208                         return
1209                 video_title = mobj.group(1).decode('utf-8')
1210                 video_title = sanitize_title(video_title)
1211                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1212
1213                 # video uploader is domain name
1214                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1215                 if mobj is None:
1216                         self._downloader.trouble(u'ERROR: unable to extract title')
1217                         return
1218                 video_uploader = mobj.group(1).decode('utf-8')
1219
1220                 try:
1221                         # Process video information
1222                         self._downloader.process_info({
1223                                 'id':           video_id.decode('utf-8'),
1224                                 'url':          video_url.decode('utf-8'),
1225                                 'uploader':     video_uploader,
1226                                 'title':        video_title,
1227                                 'stitle':       simple_title,
1228                                 'ext':          video_extension.decode('utf-8'),
1229                         })
1230                 except UnavailableFormatError:
1231                         self._downloader.trouble(u'ERROR: format not available for video')
1232
1233
1234 class YoutubeSearchIE(InfoExtractor):
1235         """Information Extractor for YouTube search queries."""
	# Query syntax: ytsearch:<terms>, ytsearchall:<terms> or ytsearch<N>:<terms>.
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	# Matches the individual video links in a results page.
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	# A "Next" link in the page means more result pages exist.
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_youtube_ie = None
	# Hard cap on results for 'ytsearchall' and oversized <N> requests.
	_max_youtube_results = 1000
1242
	def __init__(self, youtube_ie, downloader=None):
		# Each search result is downloaded through this YoutubeIE instance.
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie
1246         
1247         @staticmethod
1248         def suitable(url):
1249                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1250
1251         def report_download_page(self, query, pagenum):
1252                 """Report attempt to download playlist page with given number."""
1253                 query = query.decode(preferredencoding())
1254                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1255
	def _real_initialize(self):
		# The search itself needs no setup; just prime the YouTube extractor.
		self._youtube_ie.initialize()
1258         
1259         def _real_extract(self, query):
1260                 mobj = re.match(self._VALID_QUERY, query)
1261                 if mobj is None:
1262                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1263                         return
1264
1265                 prefix, query = query.split(':')
1266                 prefix = prefix[8:]
1267                 query  = query.encode('utf-8')
1268                 if prefix == '':
1269                         self._download_n_results(query, 1)
1270                         return
1271                 elif prefix == 'all':
1272                         self._download_n_results(query, self._max_youtube_results)
1273                         return
1274                 else:
1275                         try:
1276                                 n = long(prefix)
1277                                 if n <= 0:
1278                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1279                                         return
1280                                 elif n > self._max_youtube_results:
1281                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1282                                         n = self._max_youtube_results
1283                                 self._download_n_results(query, n)
1284                                 return
1285                         except ValueError: # parsing prefix as integer fails
1286                                 self._download_n_results(query, 1)
1287                                 return
1288
1289         def _download_n_results(self, query, n):
1290                 """Downloads a specified number of results for a query"""
1291
1292                 video_ids = []
1293                 already_seen = set()
1294                 pagenum = 1
1295
1296                 while True:
1297                         self.report_download_page(query, pagenum)
1298                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1299                         request = urllib2.Request(result_url, None, std_headers)
1300                         try:
1301                                 page = urllib2.urlopen(request).read()
1302                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1303                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1304                                 return
1305
1306                         # Extract video identifiers
1307                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1308                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1309                                 if video_id not in already_seen:
1310                                         video_ids.append(video_id)
1311                                         already_seen.add(video_id)
1312                                         if len(video_ids) == n:
1313                                                 # Specified n videos reached
1314                                                 for id in video_ids:
1315                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1316                                                 return
1317
1318                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1319                                 for id in video_ids:
1320                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1321                                 return
1322
1323                         pagenum = pagenum + 1
1324
1325 class YoutubePlaylistIE(InfoExtractor):
1326         """Information Extractor for YouTube playlists."""
1327
1328         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1329         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1330         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1331         _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1332         _youtube_ie = None
1333
1334         def __init__(self, youtube_ie, downloader=None):
1335                 InfoExtractor.__init__(self, downloader)
1336                 self._youtube_ie = youtube_ie
1337         
1338         @staticmethod
1339         def suitable(url):
1340                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1341
1342         def report_download_page(self, playlist_id, pagenum):
1343                 """Report attempt to download playlist page with given number."""
1344                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1345
1346         def _real_initialize(self):
1347                 self._youtube_ie.initialize()
1348         
1349         def _real_extract(self, url):
1350                 # Extract playlist id
1351                 mobj = re.match(self._VALID_URL, url)
1352                 if mobj is None:
1353                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1354                         return
1355
1356                 # Download playlist pages
1357                 playlist_id = mobj.group(1)
1358                 video_ids = []
1359                 pagenum = 1
1360
1361                 while True:
1362                         self.report_download_page(playlist_id, pagenum)
1363                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1364                         try:
1365                                 page = urllib2.urlopen(request).read()
1366                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1367                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1368                                 return
1369
1370                         # Extract video identifiers
1371                         ids_in_page = []
1372                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1373                                 if mobj.group(1) not in ids_in_page:
1374                                         ids_in_page.append(mobj.group(1))
1375                         video_ids.extend(ids_in_page)
1376
1377                         if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1378                                 break
1379                         pagenum = pagenum + 1
1380
1381                 for id in video_ids:
1382                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1383                 return
1384
1385 class YoutubeUserIE(InfoExtractor):
1386         """Information Extractor for YouTube users."""
1387
1388         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1389         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1390         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1391         _youtube_ie = None
1392
1393         def __init__(self, youtube_ie, downloader=None):
1394                 InfoExtractor.__init__(self, downloader)
1395                 self._youtube_ie = youtube_ie
1396         
1397         @staticmethod
1398         def suitable(url):
1399                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1400
1401         def report_download_page(self, username):
1402                 """Report attempt to download user page."""
1403                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1404
1405         def _real_initialize(self):
1406                 self._youtube_ie.initialize()
1407         
1408         def _real_extract(self, url):
1409                 # Extract username
1410                 mobj = re.match(self._VALID_URL, url)
1411                 if mobj is None:
1412                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1413                         return
1414
1415                 # Download user page
1416                 username = mobj.group(1)
1417                 video_ids = []
1418                 pagenum = 1
1419
1420                 self.report_download_page(username)
1421                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1422                 try:
1423                         page = urllib2.urlopen(request).read()
1424                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1425                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1426                         return
1427
1428                 # Extract video identifiers
1429                 ids_in_page = []
1430
1431                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1432                         if mobj.group(1) not in ids_in_page:
1433                                 ids_in_page.append(mobj.group(1))
1434                 video_ids.extend(ids_in_page)
1435
1436                 for id in video_ids:
1437                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1438                 return
1439
class PostProcessor(object):
	"""Base class for post processors.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, calling run() first
	with an initial argument and afterwards with whatever the previous
	processor returned.

	A return value of None from run() stops the rest of the chain; the
	chain also ends when its last element has run.

	PostProcessor objects follow a "mutual registration" scheme similar
	to the one used by InfoExtractor objects.
	"""

	# Downloader this processor is attached to (set at construction or
	# via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones composed by
		InfoExtractors, extended with a "filepath" entry naming the
		file that was just downloaded.

		Returning None stops the postprocessing chain. Returning an
		information dictionary — possibly the received one with some
		fields changed — passes it on to the next processor in the
		chain. The method may also raise a PostProcessingError
		exception, which the calling downloader takes into account.
		"""
		return information # by default, do nothing
1485         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Top-level exceptions raised anywhere below are translated into
		# clean exit codes by the except clauses at the end of this block.
		# Modules needed only when running the main program
		import getpass
		import optparse
1493                 # Function to update the program file with the latest version from bitbucket.org
1494                 def update_self(downloader, filename):
1495                         # Note: downloader only used for options
1496                         if not os.access (filename, os.W_OK):
1497                                 sys.exit('ERROR: no write permissions on %s' % filename)
1498
1499                         downloader.to_stdout('Updating to latest stable version...')
1500                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1501                         latest_version = urllib.urlopen(latest_url).read().strip()
1502                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1503                         newcontent = urllib.urlopen(prog_url).read()
1504                         stream = open(filename, 'w')
1505                         stream.write(newcontent)
1506                         stream.close()
1507                         downloader.to_stdout('Updated to version %s' % latest_version)
1508
		# General configuration
		# NOTE(review): each install_opener() call replaces the global opener,
		# so the ProxyHandler opener built on the first line is immediately
		# discarded by the second call — confirm whether both handlers were
		# meant to be combined into a single opener.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.02.13',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='UN', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PW', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FMT', help='video format code')
		video_format.add_option('-b', '--best-quality',
				action='store_const', dest='format', help='download the best quality video possible', const='0')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('-d', '--high-def',
				action='store_const', dest='format', help='alias for -f 22', const='22')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TPL', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='F', help='file containing URLs to download')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				batchurls = open(opts.batchfile, 'r').readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop lines that became empty after stripping whitespace
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given but no password: prompt for it interactively
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		photobucket_ie = PhotobucketIE()
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# --get-url/--get-title imply both quiet and simulate mode
			'quiet': (opts.quiet or opts.geturl or opts.gettitle),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle),
			'format': opts.format,
			# Output template: explicit -o wins, then the -t/-l title
			# templates, falling back to a plain id-based name
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'continuedl': opts.continue_dl,
			})
		# Registration order matters: the most specific extractors go first
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(photobucket_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				# -U with no URLs: the update alone was the requested action
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		# Several downloads would collide on one fixed output file name
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')