Do not use 0% as the starting point in resumed downloads (closes #40)
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import datetime
10 import htmlentitydefs
11 import httplib
12 import locale
13 import math
14 import netrc
15 import os
16 import os.path
17 import re
18 import socket
19 import string
20 import subprocess
21 import sys
22 import time
23 import urllib
24 import urllib2
25
26 # parse_qs was moved from the cgi module to the urlparse module recently.
27 try:
28         from urlparse import parse_qs
29 except ImportError:
30         from cgi import parse_qs
31
# Default HTTP headers attached to every request (see the urllib2.Request
# calls in FileDownloader._do_download); the User-Agent identifies as a
# contemporary desktop Firefox 3.6.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string of the ASCII letters and digits; presumably the character
# whitelist used when building simplified titles — TODO confirm against the
# InfoExtractors (not visible in this chunk).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
40
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	reported encoding is unknown or unusable, fall back to UTF-8.
	"""
	# The original implementation wrapped this logic in a one-shot
	# generator and called .next() on it, which added nothing; it also
	# used a bare "except:", which would swallow SystemExit and
	# KeyboardInterrupt. Both are fixed here without changing the result.
	try:
		pref = locale.getpreferredencoding()
		# Verify the encoding actually exists and can encode text.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
56
57 def htmlentity_transform(matchobj):
58         """Transforms an HTML entity to a Unicode character.
59         
60         This function receives a match object and is intended to be used with
61         the re.sub() function.
62         """
63         entity = matchobj.group(1)
64
65         # Known non-numeric HTML entity
66         if entity in htmlentitydefs.name2codepoint:
67                 return unichr(htmlentitydefs.name2codepoint[entity])
68
69         # Unicode character
70         mobj = re.match(ur'(?u)#(x?\d+)', entity)
71         if mobj is not None:
72                 numstr = mobj.group(1)
73                 if numstr.startswith(u'x'):
74                         base = 16
75                         numstr = u'0%s' % numstr
76                 else:
77                         base = 10
78                 return unichr(long(numstr, base))
79
80         # Unknown entity in name, return its literal representation
81         return (u'&%s;' % entity)
82
83 def sanitize_title(utitle):
84         """Sanitizes a video title so it could be used as part of a filename."""
85         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86         return utitle.replace(unicode(os.sep), u'%')
87
88 def sanitize_open(filename, open_mode):
89         """Try to open the given filename, and slightly tweak it if this fails.
90
91         Attempts to open the given filename. If this fails, it tries to change
92         the filename slightly, step by step, until it's either able to open it
93         or it fails and raises a final exception, like the standard open()
94         function.
95
96         It returns the tuple (stream, definitive_file_name).
97         """
98         try:
99                 if filename == u'-':
100                         if sys.platform == 'win32':
101                                 import msvcrt
102                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
103                         return (sys.stdout, filename)
104                 stream = open(filename, open_mode)
105                 return (stream, filename)
106         except (IOError, OSError), err:
107                 # In case of error, try to remove win32 forbidden chars
108                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
109
110                 # An exception here should be caught in the caller
111                 stream = open(filename, open_mode)
112                 return (stream, filename)
113
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects when a download problem occurs and
	they have not been configured to continue on errors; the exception
	carries the appropriate error message.
	"""
	pass
122
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that several files
	would end up being downloaded to the same file name on disk.
	"""
	pass
130
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
138
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that the site does not
	offer for that particular video.
	"""
	pass
146
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the downloaded data is smaller
	than the size the server announced up front, which usually means the
	connection was interrupted.
	"""
	# Both sizes are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.expected = expected
		self.downloaded = downloaded
161
162 class FileDownloader(object):
163         """File Downloader class.
164
165         File downloader objects are the ones responsible of downloading the
166         actual video file and writing it to disk if the user has requested
167         it, among some other tasks. In most cases there should be one per
168         program. As, given a video URL, the downloader doesn't know how to
169         extract all the needed information, task that InfoExtractors do, it
170         has to pass the URL to one of them.
171
172         For this, file downloader objects have a method that allows
173         InfoExtractors to be registered in a given order. When it is passed
174         a URL, the file downloader handles it to the first InfoExtractor it
175         finds that reports being able to handle it. The InfoExtractor extracts
176         all the information about the video or videos the URL refers to, and
177         asks the FileDownloader to process the video information, possibly
178         downloading the video.
179
180         File downloaders accept a lot of parameters. In order not to saturate
181         the object constructor with arguments, it receives a dictionary of
182         options instead. These options are available through the params
183         attribute for the InfoExtractors to use. The FileDownloader also
184         registers itself as the downloader in charge for the InfoExtractors
185         that are added to it, so this is a "mutual registration".
186
187         Available options:
188
189         username:         Username for authentication purposes.
190         password:         Password for authentication purposes.
191         usenetrc:         Use netrc for authentication instead.
192         quiet:            Do not print messages to stdout.
193         forceurl:         Force printing final URL.
194         forcetitle:       Force printing title.
195         forcethumbnail:   Force printing thumbnail URL.
196         forcedescription: Force printing description.
197         simulate:         Do not download the video files.
198         format:           Video format code.
199         format_limit:     Highest quality format to try.
200         outtmpl:          Template for output names.
201         ignoreerrors:     Do not stop on download errors.
202         ratelimit:        Download speed limit, in bytes/sec.
203         nooverwrites:     Prevent overwriting files.
204         retries:          Number of times to retry for HTTP error 5xx
205         continuedl:       Try to continue downloads if possible.
206         noprogress:       Do not print the progress bar.
207         playliststart:    Playlist item to start at.
208         playlistend:      Playlist item to end at.
209         logtostderr:      Log messages to stderr instead of stdout.
210         """
211
212         params = None
213         _ies = []
214         _pps = []
215         _download_retcode = None
216         _num_downloads = None
217         _screen_file = None
218
219         def __init__(self, params):
220                 """Create a FileDownloader object with the given options."""
221                 self._ies = []
222                 self._pps = []
223                 self._download_retcode = 0
224                 self._num_downloads = 0
225                 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
226                 self.params = params
227         
228         @staticmethod
229         def pmkdir(filename):
230                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
231                 components = filename.split(os.sep)
232                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
233                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
234                 for dir in aggregate:
235                         if not os.path.exists(dir):
236                                 os.mkdir(dir)
237         
238         @staticmethod
239         def temp_name(filename):
240                 """Returns a temporary filename for the given filename."""
241                 if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
242                         return filename
243                 return filename + u'.part'
244         
245         @staticmethod
246         def format_bytes(bytes):
247                 if bytes is None:
248                         return 'N/A'
249                 if type(bytes) is str:
250                         bytes = float(bytes)
251                 if bytes == 0.0:
252                         exponent = 0
253                 else:
254                         exponent = long(math.log(bytes, 1024.0))
255                 suffix = 'bkMGTPEZY'[exponent]
256                 converted = float(bytes) / float(1024**exponent)
257                 return '%.2f%s' % (converted, suffix)
258
259         @staticmethod
260         def calc_percent(byte_counter, data_len):
261                 if data_len is None:
262                         return '---.-%'
263                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
264
265         @staticmethod
266         def calc_eta(start, now, total, current):
267                 if total is None:
268                         return '--:--'
269                 dif = now - start
270                 if current == 0 or dif < 0.001: # One millisecond
271                         return '--:--'
272                 rate = float(current) / dif
273                 eta = long((float(total) - float(current)) / rate)
274                 (eta_mins, eta_secs) = divmod(eta, 60)
275                 if eta_mins > 99:
276                         return '--:--'
277                 return '%02d:%02d' % (eta_mins, eta_secs)
278
279         @staticmethod
280         def calc_speed(start, now, bytes):
281                 dif = now - start
282                 if bytes == 0 or dif < 0.001: # One millisecond
283                         return '%10s' % '---b/s'
284                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
285
286         @staticmethod
287         def best_block_size(elapsed_time, bytes):
288                 new_min = max(bytes / 2.0, 1.0)
289                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
290                 if elapsed_time < 0.001:
291                         return long(new_max)
292                 rate = bytes / elapsed_time
293                 if rate > new_max:
294                         return long(new_max)
295                 if rate < new_min:
296                         return long(new_min)
297                 return long(rate)
298
299         @staticmethod
300         def parse_bytes(bytestr):
301                 """Parse a string indicating a byte quantity into a long integer."""
302                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
303                 if matchobj is None:
304                         return None
305                 number = float(matchobj.group(1))
306                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
307                 return long(round(number * multiplier))
308
309         def add_info_extractor(self, ie):
310                 """Add an InfoExtractor object to the end of the list."""
311                 self._ies.append(ie)
312                 ie.set_downloader(self)
313         
314         def add_post_processor(self, pp):
315                 """Add a PostProcessor object to the end of the chain."""
316                 self._pps.append(pp)
317                 pp.set_downloader(self)
318         
319         def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
320                 """Print message to stdout if not in quiet mode."""
321                 try:
322                         if not self.params.get('quiet', False):
323                                 terminator = [u'\n', u''][skip_eol]
324                                 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
325                         self._screen_file.flush()
326                 except (UnicodeEncodeError), err:
327                         if not ignore_encoding_errors:
328                                 raise
329         
330         def to_stderr(self, message):
331                 """Print message to stderr."""
332                 print >>sys.stderr, message.encode(preferredencoding())
333         
334         def fixed_template(self):
335                 """Checks if the output template is fixed."""
336                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
337
338         def trouble(self, message=None):
339                 """Determine action to take when a download problem appears.
340
341                 Depending on if the downloader has been configured to ignore
342                 download errors or not, this method may throw an exception or
343                 not when errors are found, after printing the message.
344                 """
345                 if message is not None:
346                         self.to_stderr(message)
347                 if not self.params.get('ignoreerrors', False):
348                         raise DownloadError(message)
349                 self._download_retcode = 1
350
351         def slow_down(self, start_time, byte_counter):
352                 """Sleep if the download speed is over the rate limit."""
353                 rate_limit = self.params.get('ratelimit', None)
354                 if rate_limit is None or byte_counter == 0:
355                         return
356                 now = time.time()
357                 elapsed = now - start_time
358                 if elapsed <= 0.0:
359                         return
360                 speed = float(byte_counter) / elapsed
361                 if speed > rate_limit:
362                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
363         
364         def try_rename(self, old_filename, new_filename):
365                 try:
366                         if old_filename == new_filename:
367                                 return
368                         os.rename(old_filename, new_filename)
369                 except (IOError, OSError), err:
370                         self.trouble(u'ERROR: unable to rename file')
371
372         def report_destination(self, filename):
373                 """Report destination filename."""
374                 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
375         
376         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
377                 """Report download progress."""
378                 if self.params.get('noprogress', False):
379                         return
380                 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
381                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
382
383         def report_resuming_byte(self, resume_len):
384                 """Report attempt to resume at given byte."""
385                 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
386         
387         def report_retry(self, count, retries):
388                 """Report retry in case of HTTP error 5xx"""
389                 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
390         
391         def report_file_already_downloaded(self, file_name):
392                 """Report file has already been fully downloaded."""
393                 try:
394                         self.to_screen(u'[download] %s has already been downloaded' % file_name)
395                 except (UnicodeEncodeError), err:
396                         self.to_screen(u'[download] The file has already been downloaded')
397         
398         def report_unable_to_resume(self):
399                 """Report it was impossible to resume download."""
400                 self.to_screen(u'[download] Unable to resume')
401         
402         def report_finish(self):
403                 """Report download finished."""
404                 if self.params.get('noprogress', False):
405                         self.to_screen(u'[download] Download completed')
406                 else:
407                         self.to_screen(u'')
408         
409         def increment_downloads(self):
410                 """Increment the ordinal that assigns a number to each file."""
411                 self._num_downloads += 1
412
413         def process_info(self, info_dict):
414                 """Process a single dictionary returned by an InfoExtractor."""
415                 # Do nothing else if in simulate mode
416                 if self.params.get('simulate', False):
417                         # Forced printings
418                         if self.params.get('forcetitle', False):
419                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
420                         if self.params.get('forceurl', False):
421                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
422                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
423                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
424                         if self.params.get('forcedescription', False) and 'description' in info_dict:
425                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
426
427                         return
428                         
429                 try:
430                         template_dict = dict(info_dict)
431                         template_dict['epoch'] = unicode(long(time.time()))
432                         template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
433                         filename = self.params['outtmpl'] % template_dict
434                 except (ValueError, KeyError), err:
435                         self.trouble(u'ERROR: invalid system charset or erroneous output template')
436                         return
437                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
438                         self.to_stderr(u'WARNING: file exists and will be skipped')
439                         return
440
441                 try:
442                         self.pmkdir(filename)
443                 except (OSError, IOError), err:
444                         self.trouble(u'ERROR: unable to create directories: %s' % str(err))
445                         return
446
447                 try:
448                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
449                 except (OSError, IOError), err:
450                         raise UnavailableVideoError
451                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
452                         self.trouble(u'ERROR: unable to download video data: %s' % str(err))
453                         return
454                 except (ContentTooShortError, ), err:
455                         self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
456                         return
457
458                 if success:
459                         try:
460                                 self.post_process(filename, info_dict)
461                         except (PostProcessingError), err:
462                                 self.trouble(u'ERROR: postprocessing: %s' % str(err))
463                                 return
464
465         def download(self, url_list):
466                 """Download a given list of URLs."""
467                 if len(url_list) > 1 and self.fixed_template():
468                         raise SameFileError(self.params['outtmpl'])
469
470                 for url in url_list:
471                         suitable_found = False
472                         for ie in self._ies:
473                                 # Go to next InfoExtractor if not suitable
474                                 if not ie.suitable(url):
475                                         continue
476
477                                 # Suitable InfoExtractor found
478                                 suitable_found = True
479
480                                 # Extract information from URL and process it
481                                 ie.extract(url)
482
483                                 # Suitable InfoExtractor had been found; go to next URL
484                                 break
485
486                         if not suitable_found:
487                                 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
488
489                 return self._download_retcode
490
491         def post_process(self, filename, ie_info):
492                 """Run the postprocessing chain on the given file."""
493                 info = dict(ie_info)
494                 info['filepath'] = filename
495                 for pp in self._pps:
496                         info = pp.run(info)
497                         if info is None:
498                                 break
499         
500         def _download_with_rtmpdump(self, filename, url, player_url):
501                 self.report_destination(filename)
502                 tmpfilename = self.temp_name(filename)
503
504                 # Check for rtmpdump first
505                 try:
506                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
507                 except (OSError, IOError):
508                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
509                         return False
510
511                 # Download using rtmpdump. rtmpdump returns exit code 2 when
512                 # the connection was interrumpted and resuming appears to be
513                 # possible. This is part of rtmpdump's normal usage, AFAIK.
514                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
515                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
516                 while retval == 2 or retval == 1:
517                         prevsize = os.path.getsize(tmpfilename)
518                         self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
519                         time.sleep(5.0) # This seems to be needed
520                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
521                         cursize = os.path.getsize(tmpfilename)
522                         if prevsize == cursize and retval == 1:
523                                 break
524                 if retval == 0:
525                         self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
526                         self.try_rename(tmpfilename, filename)
527                         return True
528                 else:
529                         self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
530                         return False
531
532         def _do_download(self, filename, url, player_url):
533                 # Check file already present
534                 if self.params.get('continuedl', False) and os.path.isfile(filename):
535                         self.report_file_already_downloaded(filename)
536                         return True
537
538                 # Attempt to download using rtmpdump
539                 if url.startswith('rtmp'):
540                         return self._download_with_rtmpdump(filename, url, player_url)
541
542                 tmpfilename = self.temp_name(filename)
543                 stream = None
544                 open_mode = 'wb'
545                 basic_request = urllib2.Request(url, None, std_headers)
546                 request = urllib2.Request(url, None, std_headers)
547
548                 # Establish possible resume length
549                 if os.path.isfile(tmpfilename):
550                         resume_len = os.path.getsize(tmpfilename)
551                 else:
552                         resume_len = 0
553
554                 # Request parameters in case of being able to resume
555                 if self.params.get('continuedl', False) and resume_len != 0:
556                         self.report_resuming_byte(resume_len)
557                         request.add_header('Range','bytes=%d-' % resume_len)
558                         open_mode = 'ab'
559
560                 count = 0
561                 retries = self.params.get('retries', 0)
562                 while count <= retries:
563                         # Establish connection
564                         try:
565                                 data = urllib2.urlopen(request)
566                                 break
567                         except (urllib2.HTTPError, ), err:
568                                 if (err.code < 500 or err.code >= 600) and err.code != 416:
569                                         # Unexpected HTTP error
570                                         raise
571                                 elif err.code == 416:
572                                         # Unable to resume (requested range not satisfiable)
573                                         try:
574                                                 # Open the connection again without the range header
575                                                 data = urllib2.urlopen(basic_request)
576                                                 content_length = data.info()['Content-Length']
577                                         except (urllib2.HTTPError, ), err:
578                                                 if err.code < 500 or err.code >= 600:
579                                                         raise
580                                         else:
581                                                 # Examine the reported length
582                                                 if (content_length is not None and
583                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
584                                                         # The file had already been fully downloaded.
585                                                         # Explanation to the above condition: in issue #175 it was revealed that
586                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
587                                                         # changing the file size slightly and causing problems for some users. So
588                                                         # I decided to implement a suggested change and consider the file
589                                                         # completely downloaded if the file size differs less than 100 bytes from
590                                                         # the one in the hard drive.
591                                                         self.report_file_already_downloaded(filename)
592                                                         self.try_rename(tmpfilename, filename)
593                                                         return True
594                                                 else:
595                                                         # The length does not match, we start the download over
596                                                         self.report_unable_to_resume()
597                                                         open_mode = 'wb'
598                                                         break
599                         # Retry
600                         count += 1
601                         if count <= retries:
602                                 self.report_retry(count, retries)
603
604                 if count > retries:
605                         self.trouble(u'ERROR: giving up after %s retries' % retries)
606                         return False
607
608                 data_len = data.info().get('Content-length', None)
609                 if data_len is not None:
610                         data_len = long(data_len) + resume_len
611                 data_len_str = self.format_bytes(data_len)
612                 byte_counter = 0 + resume_len
613                 block_size = 1024
614                 start = time.time()
615                 while True:
616                         # Download and write
617                         before = time.time()
618                         data_block = data.read(block_size)
619                         after = time.time()
620                         data_block_len = len(data_block)
621                         if data_block_len == 0:
622                                 break
623                         byte_counter += data_block_len
624
625                         # Open file just in time
626                         if stream is None:
627                                 try:
628                                         (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
629                                         self.report_destination(filename)
630                                 except (OSError, IOError), err:
631                                         self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
632                                         return False
633                         try:
634                                 stream.write(data_block)
635                         except (IOError, OSError), err:
636                                 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
637                                 return False
638                         block_size = self.best_block_size(after - before, data_block_len)
639
640                         # Progress message
641                         percent_str = self.calc_percent(byte_counter, data_len)
642                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
643                         speed_str = self.calc_speed(start, time.time(), byte_counter)
644                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
645
646                         # Apply rate limit
647                         self.slow_down(start, byte_counter)
648
649                 stream.close()
650                 self.report_finish()
651                 if data_len is not None and str(byte_counter) != data_len:
652                         raise ContentTooShortError(byte_counter, long(data_len))
653                 self.try_rename(tmpfilename, filename)
654                 return True
655
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) turns a URL into one or more dictionaries
	describing the videos behind it, and hands each dictionary over to the
	FileDownloader for further processing, typically downloading the video
	to the file system. Each dictionary must carry the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The fields below are optional; they are only consulted by the forced
	printing functions (useful when youtube-dl serves as the backend of a
	video search front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors override _real_initialize() and _real_extract(),
	plus the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# One-time-initialization flag and the downloader this IE reports to.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Return True when this IE can handle the given URL."""
		return False

	def set_downloader(self, downloader):
		"""Attach the FileDownloader this IE should use."""
		self._downloader = downloader

	def initialize(self):
		"""Run one-time setup (authentication, etc.) exactly once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL's info."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Actual setup work; subclasses override as needed."""
		pass

	def _real_extract(self, url):
		"""Actual extraction work; subclasses override."""
		pass
726
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1: optional scheme+host prefix; group 2: the video id. The
	# trailing (?(1).+)? conditional only allows extra characters when a
	# host prefix was matched, so a bare id like "abc123" also matches.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps format code -> filename extension; anything missing falls back
	# to 'flv' at the point of use.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if the URL matches a known YouTube URL form."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_screen(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_screen(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_screen(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set language, optionally log in, and confirm age.

		Each step is best-effort: failures emit a warning (or ERROR for
		age confirmation) and abort the remaining setup without raising.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data;
		# explicit --username/--password take precedence over .netrc.
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (forces English pages so the regexes below match)
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video info from a YouTube URL and hand it to the downloader.

		Downloads the watch page and get_video_info, decodes the format
		map, and calls process_info() once per selected format.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		# NOTE(review): the literal '&amp;' below looks like an
		# HTML-escaping slip ('&' intended) — confirm before changing.
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (needed by some downloaders
		# for RTMP; may legitimately be absent).
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Un-escape the JavaScript-escaped URL (\/ -> /).
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' page variants until one
		# response contains a 'token' parameter.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse runs of non-alphanumeric characters
		# into single underscores and trim them from the ends.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page; normalized to YYYYMMDD
		# when one of the known date formats parses, left as-is otherwise.
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					# Try the next format; already-converted dates fail here harmlessly.
					pass

		# description (only scraped when the user asked to print it)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)
		# Fallback URL used when a requested format is not in fmt_url_map.
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is 'fmt|url,fmt|url,...' -> {fmt: url}
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			# -f <limit> caps quality at <limit>; _available_formats is
			# already sorted best-first, so slice from the limit onward.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				if req_format in url_map:
					video_url_list = [(req_format, url_map[req_format])] # Specific format
				else:
					video_url_list = [(req_format, get_video_template % req_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# Live/RTMP stream: single URL, no format code.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				# Keep going: other formats in the list may still work.
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
1006
1007
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1: video id, group 2: URL slug used as the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YouTube IE used to delegate 'yt-' prefixed Metacafe ids.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Requires a YoutubeIE for delegated extraction."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if the URL is a Metacafe watch URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and disable the family filter.

		Both requests are made for their cookie side effects
		(presumably via a global cookie jar — TODO confirm against the
		file's urllib2 opener setup); the response bodies are discarded.
		"""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Scrape a Metacafe watch page and hand the info to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, delegate to the
		# YouTube IE instead of scraping Metacafe.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage. Two layouts are
		# handled: a plain &mediaURL= parameter, or JSON-ish mediaData
		# inside the flashvars attribute.
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			# NOTE(review): assumes a 3-letter extension at the URL tail.
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (access key appended as a
			# __gda__ query parameter when present).
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Un-escape JSON-style slashes and append the access key.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1151
1152
1153 class DailymotionIE(InfoExtractor):
1154         """Information Extractor for Dailymotion"""
1155
1156         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1157
1158         def __init__(self, downloader=None):
1159                 InfoExtractor.__init__(self, downloader)
1160
1161         @staticmethod
1162         def suitable(url):
1163                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1164
1165         def report_download_webpage(self, video_id):
1166                 """Report webpage download."""
1167                 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1168         
1169         def report_extraction(self, video_id):
1170                 """Report information extraction."""
1171                 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1172
1173         def _real_initialize(self):
1174                 return
1175
1176         def _real_extract(self, url):
1177                 # Extract id and simplified title from URL
1178                 mobj = re.match(self._VALID_URL, url)
1179                 if mobj is None:
1180                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1181                         return
1182
1183                 # At this point we have a new video
1184                 self._downloader.increment_downloads()
1185                 video_id = mobj.group(1)
1186
1187                 simple_title = mobj.group(2).decode('utf-8')
1188                 video_extension = 'flv'
1189
1190                 # Retrieve video webpage to extract further information
1191                 request = urllib2.Request(url)
1192                 try:
1193                         self.report_download_webpage(video_id)
1194                         webpage = urllib2.urlopen(request).read()
1195                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1196                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1197                         return
1198
1199                 # Extract URL, uploader and title from webpage
1200                 self.report_extraction(video_id)
1201                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1202                 if mobj is None:
1203                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1204                         return
1205                 mediaURL = urllib.unquote(mobj.group(1))
1206
1207                 # if needed add http://www.dailymotion.com/ if relative URL
1208
1209                 video_url = mediaURL
1210
1211                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1212                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1213                 if mobj is None:
1214                         self._downloader.trouble(u'ERROR: unable to extract title')
1215                         return
1216                 video_title = mobj.group(1).decode('utf-8')
1217                 video_title = sanitize_title(video_title)
1218
1219                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1220                 if mobj is None:
1221                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1222                         return
1223                 video_uploader = mobj.group(1)
1224
1225                 try:
1226                         # Process video information
1227                         self._downloader.process_info({
1228                                 'id':           video_id.decode('utf-8'),
1229                                 'url':          video_url.decode('utf-8'),
1230                                 'uploader':     video_uploader.decode('utf-8'),
1231                                 'upload_date':  u'NA',
1232                                 'title':        video_title,
1233                                 'stitle':       simple_title,
1234                                 'ext':          video_extension.decode('utf-8'),
1235                                 'format':       u'NA',
1236                                 'player_url':   None,
1237                         })
1238                 except UnavailableVideoError:
1239                         self._downloader.trouble(u'ERROR: unable to download video')
1240
1241 class GoogleIE(InfoExtractor):
1242         """Information extractor for video.google.com."""
1243
1244         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1245
1246         def __init__(self, downloader=None):
1247                 InfoExtractor.__init__(self, downloader)
1248
1249         @staticmethod
1250         def suitable(url):
1251                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1252
1253         def report_download_webpage(self, video_id):
1254                 """Report webpage download."""
1255                 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1256
1257         def report_extraction(self, video_id):
1258                 """Report information extraction."""
1259                 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1260
1261         def _real_initialize(self):
1262                 return
1263
1264         def _real_extract(self, url):
1265                 # Extract id from URL
1266                 mobj = re.match(self._VALID_URL, url)
1267                 if mobj is None:
1268                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1269                         return
1270
1271                 # At this point we have a new video
1272                 self._downloader.increment_downloads()
1273                 video_id = mobj.group(1)
1274
1275                 video_extension = 'mp4'
1276
1277                 # Retrieve video webpage to extract further information
1278                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1279                 try:
1280                         self.report_download_webpage(video_id)
1281                         webpage = urllib2.urlopen(request).read()
1282                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1284                         return
1285
1286                 # Extract URL, uploader, and title from webpage
1287                 self.report_extraction(video_id)
1288                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1289                 if mobj is None:
1290                         video_extension = 'flv'
1291                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1292                 if mobj is None:
1293                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1294                         return
1295                 mediaURL = urllib.unquote(mobj.group(1))
1296                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1297                 mediaURL = mediaURL.replace('\\x26', '\x26')
1298
1299                 video_url = mediaURL
1300
1301                 mobj = re.search(r'<title>(.*)</title>', webpage)
1302                 if mobj is None:
1303                         self._downloader.trouble(u'ERROR: unable to extract title')
1304                         return
1305                 video_title = mobj.group(1).decode('utf-8')
1306                 video_title = sanitize_title(video_title)
1307                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1308
1309                 # Extract video description
1310                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1311                 if mobj is None:
1312                         self._downloader.trouble(u'ERROR: unable to extract video description')
1313                         return
1314                 video_description = mobj.group(1).decode('utf-8')
1315                 if not video_description:
1316                         video_description = 'No description available.'
1317
1318                 # Extract video thumbnail
1319                 if self._downloader.params.get('forcethumbnail', False):
1320                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1321                         try:
1322                                 webpage = urllib2.urlopen(request).read()
1323                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1324                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1325                                 return
1326                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1327                         if mobj is None:
1328                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1329                                 return
1330                         video_thumbnail = mobj.group(1)
1331                 else:   # we need something to pass to process_info
1332                         video_thumbnail = ''
1333
1334
1335                 try:
1336                         # Process video information
1337                         self._downloader.process_info({
1338                                 'id':           video_id.decode('utf-8'),
1339                                 'url':          video_url.decode('utf-8'),
1340                                 'uploader':     u'NA',
1341                                 'upload_date':  u'NA',
1342                                 'title':        video_title,
1343                                 'stitle':       simple_title,
1344                                 'ext':          video_extension.decode('utf-8'),
1345                                 'format':       u'NA',
1346                                 'player_url':   None,
1347                         })
1348                 except UnavailableVideoError:
1349                         self._downloader.trouble(u'ERROR: unable to download video')
1350
1351
1352 class PhotobucketIE(InfoExtractor):
1353         """Information extractor for photobucket.com."""
1354
1355         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1356
1357         def __init__(self, downloader=None):
1358                 InfoExtractor.__init__(self, downloader)
1359
1360         @staticmethod
1361         def suitable(url):
1362                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1363
1364         def report_download_webpage(self, video_id):
1365                 """Report webpage download."""
1366                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1367
1368         def report_extraction(self, video_id):
1369                 """Report information extraction."""
1370                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1371
1372         def _real_initialize(self):
1373                 return
1374
1375         def _real_extract(self, url):
1376                 # Extract id from URL
1377                 mobj = re.match(self._VALID_URL, url)
1378                 if mobj is None:
1379                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1380                         return
1381
1382                 # At this point we have a new video
1383                 self._downloader.increment_downloads()
1384                 video_id = mobj.group(1)
1385
1386                 video_extension = 'flv'
1387
1388                 # Retrieve video webpage to extract further information
1389                 request = urllib2.Request(url)
1390                 try:
1391                         self.report_download_webpage(video_id)
1392                         webpage = urllib2.urlopen(request).read()
1393                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1394                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1395                         return
1396
1397                 # Extract URL, uploader, and title from webpage
1398                 self.report_extraction(video_id)
1399                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1400                 if mobj is None:
1401                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1402                         return
1403                 mediaURL = urllib.unquote(mobj.group(1))
1404
1405                 video_url = mediaURL
1406
1407                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1408                 if mobj is None:
1409                         self._downloader.trouble(u'ERROR: unable to extract title')
1410                         return
1411                 video_title = mobj.group(1).decode('utf-8')
1412                 video_title = sanitize_title(video_title)
1413                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1414
1415                 video_uploader = mobj.group(2).decode('utf-8')
1416
1417                 try:
1418                         # Process video information
1419                         self._downloader.process_info({
1420                                 'id':           video_id.decode('utf-8'),
1421                                 'url':          video_url.decode('utf-8'),
1422                                 'uploader':     video_uploader,
1423                                 'upload_date':  u'NA',
1424                                 'title':        video_title,
1425                                 'stitle':       simple_title,
1426                                 'ext':          video_extension.decode('utf-8'),
1427                                 'format':       u'NA',
1428                                 'player_url':   None,
1429                         })
1430                 except UnavailableVideoError:
1431                         self._downloader.trouble(u'ERROR: unable to download video')
1432
1433
1434 class YahooIE(InfoExtractor):
1435         """Information extractor for video.yahoo.com."""
1436
1437         # _VALID_URL matches all Yahoo! Video URLs
1438         # _VPAGE_URL matches only the extractable '/watch/' URLs
1439         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1440         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1441
1442         def __init__(self, downloader=None):
1443                 InfoExtractor.__init__(self, downloader)
1444
1445         @staticmethod
1446         def suitable(url):
1447                 return (re.match(YahooIE._VALID_URL, url) is not None)
1448
1449         def report_download_webpage(self, video_id):
1450                 """Report webpage download."""
1451                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1452
1453         def report_extraction(self, video_id):
1454                 """Report information extraction."""
1455                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1456
1457         def _real_initialize(self):
1458                 return
1459
1460         def _real_extract(self, url, new_video=True):
1461                 # Extract ID from URL
1462                 mobj = re.match(self._VALID_URL, url)
1463                 if mobj is None:
1464                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1465                         return
1466
1467                 # At this point we have a new video
1468                 self._downloader.increment_downloads()
1469                 video_id = mobj.group(2)
1470                 video_extension = 'flv'
1471
1472                 # Rewrite valid but non-extractable URLs as
1473                 # extractable English language /watch/ URLs
1474                 if re.match(self._VPAGE_URL, url) is None:
1475                         request = urllib2.Request(url)
1476                         try:
1477                                 webpage = urllib2.urlopen(request).read()
1478                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1479                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1480                                 return
1481
1482                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1483                         if mobj is None:
1484                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1485                                 return
1486                         yahoo_id = mobj.group(1)
1487
1488                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1489                         if mobj is None:
1490                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1491                                 return
1492                         yahoo_vid = mobj.group(1)
1493
1494                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1495                         return self._real_extract(url, new_video=False)
1496
1497                 # Retrieve video webpage to extract further information
1498                 request = urllib2.Request(url)
1499                 try:
1500                         self.report_download_webpage(video_id)
1501                         webpage = urllib2.urlopen(request).read()
1502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1504                         return
1505
1506                 # Extract uploader and title from webpage
1507                 self.report_extraction(video_id)
1508                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1509                 if mobj is None:
1510                         self._downloader.trouble(u'ERROR: unable to extract video title')
1511                         return
1512                 video_title = mobj.group(1).decode('utf-8')
1513                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1514
1515                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1516                 if mobj is None:
1517                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1518                         return
1519                 video_uploader = mobj.group(1).decode('utf-8')
1520
1521                 # Extract video thumbnail
1522                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1523                 if mobj is None:
1524                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1525                         return
1526                 video_thumbnail = mobj.group(1).decode('utf-8')
1527
1528                 # Extract video description
1529                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1530                 if mobj is None:
1531                         self._downloader.trouble(u'ERROR: unable to extract video description')
1532                         return
1533                 video_description = mobj.group(1).decode('utf-8')
1534                 if not video_description: video_description = 'No description available.'
1535
1536                 # Extract video height and width
1537                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1538                 if mobj is None:
1539                         self._downloader.trouble(u'ERROR: unable to extract video height')
1540                         return
1541                 yv_video_height = mobj.group(1)
1542
1543                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1544                 if mobj is None:
1545                         self._downloader.trouble(u'ERROR: unable to extract video width')
1546                         return
1547                 yv_video_width = mobj.group(1)
1548
1549                 # Retrieve video playlist to extract media URL
1550                 # I'm not completely sure what all these options are, but we
1551                 # seem to need most of them, otherwise the server sends a 401.
1552                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1553                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1554                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1555                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1556                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1557                 try:
1558                         self.report_download_webpage(video_id)
1559                         webpage = urllib2.urlopen(request).read()
1560                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1561                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1562                         return
1563
1564                 # Extract media URL from playlist XML
1565                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1566                 if mobj is None:
1567                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1568                         return
1569                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1570                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1571
1572                 try:
1573                         # Process video information
1574                         self._downloader.process_info({
1575                                 'id':           video_id.decode('utf-8'),
1576                                 'url':          video_url,
1577                                 'uploader':     video_uploader,
1578                                 'upload_date':  u'NA',
1579                                 'title':        video_title,
1580                                 'stitle':       simple_title,
1581                                 'ext':          video_extension.decode('utf-8'),
1582                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1583                                 'description':  video_description,
1584                                 'thumbnail':    video_thumbnail,
1585                                 'description':  video_description,
1586                                 'player_url':   None,
1587                         })
1588                 except UnavailableVideoError:
1589                         self._downloader.trouble(u'ERROR: unable to download video')
1590
1591
1592 class GenericIE(InfoExtractor):
1593         """Generic last-resort information extractor."""
1594
1595         def __init__(self, downloader=None):
1596                 InfoExtractor.__init__(self, downloader)
1597
1598         @staticmethod
1599         def suitable(url):
1600                 return True
1601
1602         def report_download_webpage(self, video_id):
1603                 """Report webpage download."""
1604                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1605                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1606
1607         def report_extraction(self, video_id):
1608                 """Report information extraction."""
1609                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1610
1611         def _real_initialize(self):
1612                 return
1613
1614         def _real_extract(self, url):
1615                 # At this point we have a new video
1616                 self._downloader.increment_downloads()
1617
1618                 video_id = url.split('/')[-1]
1619                 request = urllib2.Request(url)
1620                 try:
1621                         self.report_download_webpage(video_id)
1622                         webpage = urllib2.urlopen(request).read()
1623                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1624                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1625                         return
1626                 except ValueError, err:
1627                         # since this is the last-resort InfoExtractor, if
1628                         # this error is thrown, it'll be thrown here
1629                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1630                         return
1631
1632                 self.report_extraction(video_id)
1633                 # Start with something easy: JW Player in SWFObject
1634                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1635                 if mobj is None:
1636                         # Broaden the search a little bit
1637                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1638                 if mobj is None:
1639                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1640                         return
1641
1642                 # It's possible that one of the regexes
1643                 # matched, but returned an empty group:
1644                 if mobj.group(1) is None:
1645                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1646                         return
1647
1648                 video_url = urllib.unquote(mobj.group(1))
1649                 video_id  = os.path.basename(video_url)
1650
1651                 # here's a fun little line of code for you:
1652                 video_extension = os.path.splitext(video_id)[1][1:]
1653                 video_id        = os.path.splitext(video_id)[0]
1654
1655                 # it's tempting to parse this further, but you would
1656                 # have to take into account all the variations like
1657                 #   Video Title - Site Name
1658                 #   Site Name | Video Title
1659                 #   Video Title - Tagline | Site Name
1660                 # and so on and so forth; it's just not practical
1661                 mobj = re.search(r'<title>(.*)</title>', webpage)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: unable to extract title')
1664                         return
1665                 video_title = mobj.group(1).decode('utf-8')
1666                 video_title = sanitize_title(video_title)
1667                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1668
1669                 # video uploader is domain name
1670                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1671                 if mobj is None:
1672                         self._downloader.trouble(u'ERROR: unable to extract title')
1673                         return
1674                 video_uploader = mobj.group(1).decode('utf-8')
1675
1676                 try:
1677                         # Process video information
1678                         self._downloader.process_info({
1679                                 'id':           video_id.decode('utf-8'),
1680                                 'url':          video_url.decode('utf-8'),
1681                                 'uploader':     video_uploader,
1682                                 'upload_date':  u'NA',
1683                                 'title':        video_title,
1684                                 'stitle':       simple_title,
1685                                 'ext':          video_extension.decode('utf-8'),
1686                                 'format':       u'NA',
1687                                 'player_url':   None,
1688                         })
1689                 except UnavailableVideoError, err:
1690                         self._downloader.trouble(u'ERROR: unable to download video')
1691
1692
1693 class YoutubeSearchIE(InfoExtractor):
1694         """Information Extractor for YouTube search queries."""
1695         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1696         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1697         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1698         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1699         _youtube_ie = None
1700         _max_youtube_results = 1000
1701
1702         def __init__(self, youtube_ie, downloader=None):
1703                 InfoExtractor.__init__(self, downloader)
1704                 self._youtube_ie = youtube_ie
1705         
1706         @staticmethod
1707         def suitable(url):
1708                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1709
1710         def report_download_page(self, query, pagenum):
1711                 """Report attempt to download playlist page with given number."""
1712                 query = query.decode(preferredencoding())
1713                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1714
1715         def _real_initialize(self):
1716                 self._youtube_ie.initialize()
1717         
1718         def _real_extract(self, query):
1719                 mobj = re.match(self._VALID_QUERY, query)
1720                 if mobj is None:
1721                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1722                         return
1723
1724                 prefix, query = query.split(':')
1725                 prefix = prefix[8:]
1726                 query  = query.encode('utf-8')
1727                 if prefix == '':
1728                         self._download_n_results(query, 1)
1729                         return
1730                 elif prefix == 'all':
1731                         self._download_n_results(query, self._max_youtube_results)
1732                         return
1733                 else:
1734                         try:
1735                                 n = long(prefix)
1736                                 if n <= 0:
1737                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1738                                         return
1739                                 elif n > self._max_youtube_results:
1740                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1741                                         n = self._max_youtube_results
1742                                 self._download_n_results(query, n)
1743                                 return
1744                         except ValueError: # parsing prefix as integer fails
1745                                 self._download_n_results(query, 1)
1746                                 return
1747
1748         def _download_n_results(self, query, n):
1749                 """Downloads a specified number of results for a query"""
1750
1751                 video_ids = []
1752                 already_seen = set()
1753                 pagenum = 1
1754
1755                 while True:
1756                         self.report_download_page(query, pagenum)
1757                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1758                         request = urllib2.Request(result_url, None, std_headers)
1759                         try:
1760                                 page = urllib2.urlopen(request).read()
1761                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763                                 return
1764
1765                         # Extract video identifiers
1766                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1767                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1768                                 if video_id not in already_seen:
1769                                         video_ids.append(video_id)
1770                                         already_seen.add(video_id)
1771                                         if len(video_ids) == n:
1772                                                 # Specified n videos reached
1773                                                 for id in video_ids:
1774                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1775                                                 return
1776
1777                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1778                                 for id in video_ids:
1779                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1780                                 return
1781
1782                         pagenum = pagenum + 1
1783
1784 class GoogleSearchIE(InfoExtractor):
1785         """Information Extractor for Google Video search queries."""
1786         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1787         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1788         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1789         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1790         _google_ie = None
1791         _max_google_results = 1000
1792
	def __init__(self, google_ie, downloader=None):
		# Keep the GoogleIE instance that performs the per-video
		# extraction for each search result.
		InfoExtractor.__init__(self, downloader)
		self._google_ie = google_ie
1796         
1797         @staticmethod
1798         def suitable(url):
1799                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1800
1801         def report_download_page(self, query, pagenum):
1802                 """Report attempt to download playlist page with given number."""
1803                 query = query.decode(preferredencoding())
1804                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1805
1806         def _real_initialize(self):
1807                 self._google_ie.initialize()
1808         
1809         def _real_extract(self, query):
1810                 mobj = re.match(self._VALID_QUERY, query)
1811                 if mobj is None:
1812                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1813                         return
1814
1815                 prefix, query = query.split(':')
1816                 prefix = prefix[8:]
1817                 query  = query.encode('utf-8')
1818                 if prefix == '':
1819                         self._download_n_results(query, 1)
1820                         return
1821                 elif prefix == 'all':
1822                         self._download_n_results(query, self._max_google_results)
1823                         return
1824                 else:
1825                         try:
1826                                 n = long(prefix)
1827                                 if n <= 0:
1828                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1829                                         return
1830                                 elif n > self._max_google_results:
1831                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1832                                         n = self._max_google_results
1833                                 self._download_n_results(query, n)
1834                                 return
1835                         except ValueError: # parsing prefix as integer fails
1836                                 self._download_n_results(query, 1)
1837                                 return
1838
1839         def _download_n_results(self, query, n):
1840                 """Downloads a specified number of results for a query"""
1841
1842                 video_ids = []
1843                 already_seen = set()
1844                 pagenum = 1
1845
1846                 while True:
1847                         self.report_download_page(query, pagenum)
1848                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1849                         request = urllib2.Request(result_url, None, std_headers)
1850                         try:
1851                                 page = urllib2.urlopen(request).read()
1852                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1853                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1854                                 return
1855
1856                         # Extract video identifiers
1857                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1858                                 video_id = mobj.group(1)
1859                                 if video_id not in already_seen:
1860                                         video_ids.append(video_id)
1861                                         already_seen.add(video_id)
1862                                         if len(video_ids) == n:
1863                                                 # Specified n videos reached
1864                                                 for id in video_ids:
1865                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1866                                                 return
1867
1868                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1869                                 for id in video_ids:
1870                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1871                                 return
1872
1873                         pagenum = pagenum + 1
1874
1875 class YahooSearchIE(InfoExtractor):
1876         """Information Extractor for Yahoo! Video search queries."""
1877         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1878         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1879         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1880         _MORE_PAGES_INDICATOR = r'\s*Next'
1881         _yahoo_ie = None
1882         _max_yahoo_results = 1000
1883
1884         def __init__(self, yahoo_ie, downloader=None):
1885                 InfoExtractor.__init__(self, downloader)
1886                 self._yahoo_ie = yahoo_ie
1887         
1888         @staticmethod
1889         def suitable(url):
1890                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1891
1892         def report_download_page(self, query, pagenum):
1893                 """Report attempt to download playlist page with given number."""
1894                 query = query.decode(preferredencoding())
1895                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1896
1897         def _real_initialize(self):
1898                 self._yahoo_ie.initialize()
1899         
1900         def _real_extract(self, query):
1901                 mobj = re.match(self._VALID_QUERY, query)
1902                 if mobj is None:
1903                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1904                         return
1905
1906                 prefix, query = query.split(':')
1907                 prefix = prefix[8:]
1908                 query  = query.encode('utf-8')
1909                 if prefix == '':
1910                         self._download_n_results(query, 1)
1911                         return
1912                 elif prefix == 'all':
1913                         self._download_n_results(query, self._max_yahoo_results)
1914                         return
1915                 else:
1916                         try:
1917                                 n = long(prefix)
1918                                 if n <= 0:
1919                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1920                                         return
1921                                 elif n > self._max_yahoo_results:
1922                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1923                                         n = self._max_yahoo_results
1924                                 self._download_n_results(query, n)
1925                                 return
1926                         except ValueError: # parsing prefix as integer fails
1927                                 self._download_n_results(query, 1)
1928                                 return
1929
1930         def _download_n_results(self, query, n):
1931                 """Downloads a specified number of results for a query"""
1932
1933                 video_ids = []
1934                 already_seen = set()
1935                 pagenum = 1
1936
1937                 while True:
1938                         self.report_download_page(query, pagenum)
1939                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1940                         request = urllib2.Request(result_url, None, std_headers)
1941                         try:
1942                                 page = urllib2.urlopen(request).read()
1943                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1945                                 return
1946
1947                         # Extract video identifiers
1948                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1949                                 video_id = mobj.group(1)
1950                                 if video_id not in already_seen:
1951                                         video_ids.append(video_id)
1952                                         already_seen.add(video_id)
1953                                         if len(video_ids) == n:
1954                                                 # Specified n videos reached
1955                                                 for id in video_ids:
1956                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1957                                                 return
1958
1959                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1960                                 for id in video_ids:
1961                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1962                                 return
1963
1964                         pagenum = pagenum + 1
1965
1966 class YoutubePlaylistIE(InfoExtractor):
1967         """Information Extractor for YouTube playlists."""
1968
1969         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1970         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1971         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1972         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1973         _youtube_ie = None
1974
1975         def __init__(self, youtube_ie, downloader=None):
1976                 InfoExtractor.__init__(self, downloader)
1977                 self._youtube_ie = youtube_ie
1978         
1979         @staticmethod
1980         def suitable(url):
1981                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1982
1983         def report_download_page(self, playlist_id, pagenum):
1984                 """Report attempt to download playlist page with given number."""
1985                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1986
1987         def _real_initialize(self):
1988                 self._youtube_ie.initialize()
1989         
1990         def _real_extract(self, url):
1991                 # Extract playlist id
1992                 mobj = re.match(self._VALID_URL, url)
1993                 if mobj is None:
1994                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995                         return
1996
1997                 # Download playlist pages
1998                 playlist_id = mobj.group(1)
1999                 video_ids = []
2000                 pagenum = 1
2001
2002                 while True:
2003                         self.report_download_page(playlist_id, pagenum)
2004                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2005                         try:
2006                                 page = urllib2.urlopen(request).read()
2007                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2008                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2009                                 return
2010
2011                         # Extract video identifiers
2012                         ids_in_page = []
2013                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2014                                 if mobj.group(1) not in ids_in_page:
2015                                         ids_in_page.append(mobj.group(1))
2016                         video_ids.extend(ids_in_page)
2017
2018                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2019                                 break
2020                         pagenum = pagenum + 1
2021
2022                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2023                 playlistend = self._downloader.params.get('playlistend', -1)
2024                 video_ids = video_ids[playliststart:playlistend]
2025
2026                 for id in video_ids:
2027                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2028                 return
2029
2030 class YoutubeUserIE(InfoExtractor):
2031         """Information Extractor for YouTube users."""
2032
2033         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2034         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2035         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2036         _youtube_ie = None
2037
2038         def __init__(self, youtube_ie, downloader=None):
2039                 InfoExtractor.__init__(self, downloader)
2040                 self._youtube_ie = youtube_ie
2041         
2042         @staticmethod
2043         def suitable(url):
2044                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2045
2046         def report_download_page(self, username):
2047                 """Report attempt to download user page."""
2048                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2049
2050         def _real_initialize(self):
2051                 self._youtube_ie.initialize()
2052         
2053         def _real_extract(self, url):
2054                 # Extract username
2055                 mobj = re.match(self._VALID_URL, url)
2056                 if mobj is None:
2057                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2058                         return
2059
2060                 # Download user page
2061                 username = mobj.group(1)
2062                 video_ids = []
2063                 pagenum = 1
2064
2065                 self.report_download_page(username)
2066                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2067                 try:
2068                         page = urllib2.urlopen(request).read()
2069                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2070                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2071                         return
2072
2073                 # Extract video identifiers
2074                 ids_in_page = []
2075
2076                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2077                         if mobj.group(1) not in ids_in_page:
2078                                 ids_in_page.append(mobj.group(1))
2079                 video_ids.extend(ids_in_page)
2080
2081                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2082                 playlistend = self._downloader.params.get('playlistend', -1)
2083                 video_ids = video_ids[playliststart:playlistend]
2084
2085                 for id in video_ids:
2086                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2087                 return
2088
2089 class DepositFilesIE(InfoExtractor):
2090         """Information extractor for depositfiles.com"""
2091
2092         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2093
2094         def __init__(self, downloader=None):
2095                 InfoExtractor.__init__(self, downloader)
2096
2097         @staticmethod
2098         def suitable(url):
2099                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2100
2101         def report_download_webpage(self, file_id):
2102                 """Report webpage download."""
2103                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2104
2105         def report_extraction(self, file_id):
2106                 """Report information extraction."""
2107                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2108
2109         def _real_initialize(self):
2110                 return
2111
2112         def _real_extract(self, url):
2113                 # At this point we have a new file
2114                 self._downloader.increment_downloads()
2115
2116                 file_id = url.split('/')[-1]
2117                 # Rebuild url in english locale
2118                 url = 'http://depositfiles.com/en/files/' + file_id
2119
2120                 # Retrieve file webpage with 'Free download' button pressed
2121                 free_download_indication = { 'gateway_result' : '1' }
2122                 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2123                 try:
2124                         self.report_download_webpage(file_id)
2125                         webpage = urllib2.urlopen(request).read()
2126                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2127                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2128                         return
2129
2130                 # Search for the real file URL
2131                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2132                 if (mobj is None) or (mobj.group(1) is None):
2133                         # Try to figure out reason of the error.
2134                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2135                         if (mobj is not None) and (mobj.group(1) is not None):
2136                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2137                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2138                         else:
2139                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2140                         return
2141
2142                 file_url = mobj.group(1)
2143                 file_extension = os.path.splitext(file_url)[1][1:]
2144
2145                 # Search for file title
2146                 mobj = re.search(r'<b title="(.*?)">', webpage)
2147                 if mobj is None:
2148                         self._downloader.trouble(u'ERROR: unable to extract title')
2149                         return
2150                 file_title = mobj.group(1).decode('utf-8')
2151
2152                 try:
2153                         # Process file information
2154                         self._downloader.process_info({
2155                                 'id':           file_id.decode('utf-8'),
2156                                 'url':          file_url.decode('utf-8'),
2157                                 'uploader':     u'NA',
2158                                 'upload_date':  u'NA',
2159                                 'title':        file_title,
2160                                 'stitle':       file_title,
2161                                 'ext':          file_extension.decode('utf-8'),
2162                                 'format':       u'NA',
2163                                 'player_url':   None,
2164                         })
2165                 except UnavailableVideoError, err:
2166                         self._downloader.trouble(u'ERROR: unable to download file')
2167
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, feeding each run()
	the value returned by the previous one (the first receives the
	initial information dictionary).

	Processing stops as soon as a run() call returns None, or when the
	last PostProcessor in the chain has been executed.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	# Downloader this PP is attached to (set at construction or later
	# via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, plus an extra "filepath" key pointing at the
		file that was just downloaded.

		Returning None halts the post-processing chain; returning a
		dictionary (possibly the received one with some fields
		changed) passes it on to the next PostProcessor.

		Implementations may also raise PostProcessingError, which the
		calling downloader takes into account.
		"""
		# Default implementation: pass the information through untouched.
		return information
2213         
2214 ### MAIN PROGRAM ###
2215 if __name__ == '__main__':
2216         try:
2217                 # Modules needed only when running the main program
2218                 import getpass
2219                 import optparse
2220
2221                 # Function to update the program file with the latest version from bitbucket.org
2222                 def update_self(downloader, filename):
2223                         # Note: downloader only used for options
2224                         if not os.access (filename, os.W_OK):
2225                                 sys.exit('ERROR: no write permissions on %s' % filename)
2226
2227                         downloader.to_screen('Updating to latest stable version...')
2228                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2229                         latest_version = urllib.urlopen(latest_url).read().strip()
2230                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2231                         newcontent = urllib.urlopen(prog_url).read()
2232                         stream = open(filename, 'w')
2233                         stream.write(newcontent)
2234                         stream.close()
2235                         downloader.to_screen('Updated to version %s' % latest_version)
2236
2237                 # Parse command line
2238                 parser = optparse.OptionParser(
2239                         usage='Usage: %prog [options] url...',
2240                         version='2010.12.09',
2241                         conflict_handler='resolve',
2242                 )
2243
2244                 parser.add_option('-h', '--help',
2245                                 action='help', help='print this help text and exit')
2246                 parser.add_option('-v', '--version',
2247                                 action='version', help='print program version and exit')
2248                 parser.add_option('-U', '--update',
2249                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2250                 parser.add_option('-i', '--ignore-errors',
2251                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2252                 parser.add_option('-r', '--rate-limit',
2253                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2254                 parser.add_option('-R', '--retries',
2255                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2256                 parser.add_option('--playlist-start',
2257                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2258                 parser.add_option('--playlist-end',
2259                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2260
2261                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2262                 authentication.add_option('-u', '--username',
2263                                 dest='username', metavar='USERNAME', help='account username')
2264                 authentication.add_option('-p', '--password',
2265                                 dest='password', metavar='PASSWORD', help='account password')
2266                 authentication.add_option('-n', '--netrc',
2267                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2268                 parser.add_option_group(authentication)
2269
2270                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2271                 video_format.add_option('-f', '--format',
2272                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2273                 video_format.add_option('-m', '--mobile-version',
2274                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2275                 video_format.add_option('--all-formats',
2276                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2277                 video_format.add_option('--max-quality',
2278                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2279                 video_format.add_option('-b', '--best-quality',
2280                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2281                 parser.add_option_group(video_format)
2282
2283                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2284                 verbosity.add_option('-q', '--quiet',
2285                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2286                 verbosity.add_option('-s', '--simulate',
2287                                 action='store_true', dest='simulate', help='do not download video', default=False)
2288                 verbosity.add_option('-g', '--get-url',
2289                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2290                 verbosity.add_option('-e', '--get-title',
2291                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2292                 verbosity.add_option('--get-thumbnail',
2293                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2294                 verbosity.add_option('--get-description',
2295                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2296                 verbosity.add_option('--no-progress',
2297                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2298                 parser.add_option_group(verbosity)
2299
2300                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2301                 filesystem.add_option('-t', '--title',
2302                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2303                 filesystem.add_option('-l', '--literal',
2304                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2305                 filesystem.add_option('-A', '--auto-number',
2306                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2307                 filesystem.add_option('-o', '--output',
2308                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2309                 filesystem.add_option('-a', '--batch-file',
2310                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2311                 filesystem.add_option('-w', '--no-overwrites',
2312                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2313                 filesystem.add_option('-c', '--continue',
2314                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2315                 filesystem.add_option('--cookies',
2316                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2317                 parser.add_option_group(filesystem)
2318
2319                 (opts, args) = parser.parse_args()
2320
2321                 # Open appropriate CookieJar
2322                 if opts.cookiefile is None:
2323                         jar = cookielib.CookieJar()
2324                 else:
2325                         try:
2326                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
				# Only load an existing cookie file if it is present and
				# readable; a missing file is simply created on save below.
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				# A cookie file that cannot be opened is fatal: logins and
				# age-gated downloads would misbehave without it.
				sys.exit(u'ERROR: unable to open cookie file')

		# General configuration
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		# NOTE(review): the opener installed on the next line is immediately
		# replaced by the one after it, and build_opener() already adds a
		# default ProxyHandler — the first install looks redundant; confirm
		# before removing.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(cookie_processor))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Batch file verification: one URL per line; blank lines and lines
		# starting with '#', '/' or ';' are skipped as comments.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					# '-' means: read the URL list from standard input.
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		# Batch URLs are processed before the ones given on the command line.
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without a password: prompt for it interactively.
			# NOTE(review): getpass does not appear in this file's import list
			# (see top of file) — verify it is imported, otherwise this line
			# raises NameError.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# parse_bytes understands suffixed values (e.g. "50k"); None
			# signals an unparseable limit.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			# Playlist start is 1-based, so it must be strictly positive.
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			# -1 is the sentinel for "no upper bound"; any other value must be
			# positive and not precede the start index.
			opts.playlistend = long(opts.playlistend)
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')

		# Information extractors
		# Some extractors delegate to others (e.g. playlists resolve each
		# entry through the plain YouTube extractor), hence the constructor
		# arguments.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any of the "just print X" modes implies quiet + simulate.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Cascade of `or` clauses: the first truthy template wins, so an
			# explicit -o template overrides everything, then the format/-1
			# variants, then title/literal/autonumber combinations, and
			# finally the plain '%(id)s.%(ext)s' fallback.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# With '-o -' the video goes to stdout, so logging must use stderr.
			'logtostderr': opts.outtmpl == '-',
			})
		# NOTE(review): registration order appears to be matching priority —
		# the more specific extractors (search, playlist, user) come before
		# the plain ones; confirm against FileDownloader before reordering.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			# No URLs is only acceptable when the run was a self-update.
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		# Propagate the downloader's return code as the process exit status.
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		# Several downloads would have written to one fixed output file.
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')