d60fa60c9ea3b9ef162dbc1114332fcd1f850ddd
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
8 import cookielib
9 import datetime
10 import htmlentitydefs
11 import httplib
12 import locale
13 import math
14 import netrc
15 import os
16 import os.path
17 import re
18 import socket
19 import string
20 import subprocess
21 import sys
22 import time
23 import urllib
24 import urllib2
25
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	# Fallback for Python versions before 2.6, where parse_qs still
	# lived in the cgi module.
	from cgi import parse_qs
31
# Default HTTP headers sent with every request; the User-Agent mimics a
# desktop Firefox so video sites serve the same pages a browser would get.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}
38
39 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
40
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to UTF-8 when the reported encoding is unusable.
	"""
	# The original implementation wrapped this in a single-use generator
	# (yield_preferredencoding().next()) for no benefit; a plain
	# try/except is equivalent and clearer. The bare "except:" is also
	# narrowed to Exception so KeyboardInterrupt/SystemExit propagate.
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check that the reported codec actually exists and can
		# encode text; some platforms report bogus encoding names.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
56
57 def htmlentity_transform(matchobj):
58         """Transforms an HTML entity to a Unicode character.
59         
60         This function receives a match object and is intended to be used with
61         the re.sub() function.
62         """
63         entity = matchobj.group(1)
64
65         # Known non-numeric HTML entity
66         if entity in htmlentitydefs.name2codepoint:
67                 return unichr(htmlentitydefs.name2codepoint[entity])
68
69         # Unicode character
70         mobj = re.match(ur'(?u)#(x?\d+)', entity)
71         if mobj is not None:
72                 numstr = mobj.group(1)
73                 if numstr.startswith(u'x'):
74                         base = 16
75                         numstr = u'0%s' % numstr
76                 else:
77                         base = 10
78                 return unichr(long(numstr, base))
79
80         # Unknown entity in name, return its literal representation
81         return (u'&%s;' % entity)
82
83 def sanitize_title(utitle):
84         """Sanitizes a video title so it could be used as part of a filename."""
85         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
86         return utitle.replace(unicode(os.sep), u'%')
87
88 def sanitize_open(filename, open_mode):
89         """Try to open the given filename, and slightly tweak it if this fails.
90
91         Attempts to open the given filename. If this fails, it tries to change
92         the filename slightly, step by step, until it's either able to open it
93         or it fails and raises a final exception, like the standard open()
94         function.
95
96         It returns the tuple (stream, definitive_file_name).
97         """
98         try:
99                 if filename == u'-':
100                         if sys.platform == 'win32':
101                                 import msvcrt
102                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
103                         return (sys.stdout, filename)
104                 stream = open(filename, open_mode)
105                 return (stream, filename)
106         except (IOError, OSError), err:
107                 # In case of error, try to remove win32 forbidden chars
108                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
109
110                 # An exception here should be caught in the caller
111                 stream = open(filename, open_mode)
112                 return (stream, filename)
113
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects that are not configured to continue
	on errors; carries the relevant error message.
	"""
	pass
122
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that several files
	would have to be written to the same path on disk.
	"""
	pass
130
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
138
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that the site does not
	offer for that video.
	"""
	pass
146
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out to
	be smaller than the size the server announced, which usually means
	the connection was interrupted.
	"""
	# Both counts are in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
161
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible for downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader hands it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	logtostderr:      Log messages to stderr instead of stdout.
	"""

	params = None           # options dictionary passed to __init__
	_ies = []               # registered InfoExtractors, in priority order
	_pps = []               # registered PostProcessors, run as a chain
	_download_retcode = None   # value returned by download(): 0 ok, 1 had errors
	_num_downloads = None      # ordinal used for the %(autonumber)s template
	_screen_file = None        # stream normal messages go to (stdout or stderr)

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Index a two-element list with a bool: False -> stdout, True -> stderr
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# All leading sub-paths; the last component (the file name itself)
		# is excluded by the xrange upper bound.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def temp_name(filename):
		"""Returns a temporary filename for the given filename."""
		return filename + '.part'

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.50M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		# One-letter unit suffix selected by the 1024-exponent.
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return the download percentage as a fixed-width string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate time to completion as 'MM:SS' from bytes so far and elapsed time."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# ETAs beyond 99 minutes do not fit the MM:SS field.
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Adapt the read block size to the measured speed of the last read.

		The new size is clamped between half and double the previous
		block, and never exceeds 4 MB.
		"""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# group(2) may be empty; str.index('') is 0, i.e. multiplier 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE also gets a reference back to us.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# Bool-indexed terminator: newline normally, nothing
				# when the caller wants to keep updating the same line.
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" means it contains no %(...)s substitutions, so every
		# download would be written to the same file.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to drop back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def try_rename(self, old_filename, new_filename):
		"""Rename the .part file to its final name, reporting failure via trouble()."""
		try:
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# \r rewinds to the start of the line so the bar updates in place.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) file name.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# The progress line was drawn with skip_eol; emit the newline.
			self.to_screen(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# Build the output filename from the template; %(epoch)s and
		# %(autonumber)s are synthesized here, the rest come from info_dict.
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local filesystem errors mean the video could not be saved
			# in the requested format; let the caller decide what to do.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed template with several URLs would overwrite one file.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		# Each postprocessor receives the previous one's output dict; a
		# None return stops the chain.
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to the rtmpdump tool."""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The bool-indexed lists below include the optional arguments
		# only when their condition holds.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress since the last attempt: give up retrying.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url to filename (via a .part temp file), resuming and
		retrying as configured. Returns True on success, False otherwise."""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'
		# basic_request is kept without a Range header as a fallback for
		# servers that reject the resume request with 416.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# NOTE: data_len is the raw header value, i.e. a string (or None).
		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		stream.close()
		self.report_finish()
		# String comparison is intentional: data_len is the header string.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)
		return True
649
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor (IE) receives a URL and extracts, for each
	video that URL refers to, a dictionary of information which is then
	handed to the FileDownloader. The FileDownloader may download the
	video to disk or merely print selected fields, depending on its
	parameters. Every dictionary must carry these fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	Optional fields, used only by the forced-printing functions (e.g. so
	youtube-dl can back a video search front end such as youtube2mp3):

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize(), _real_extract() and
	the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# Class-level defaults; every instance overwrites both in __init__.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the extractor, optionally attaching a downloader."""
		self._ready = False
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach the FileDownloader that will consume extracted info."""
		self._downloader = downloader

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		# The base class handles nothing; subclasses override this.
		return False

	def initialize(self):
		"""Run one-time initialization (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Initialize if needed, then extract and return the URL's info."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Subclass hook: the actual initialization work."""
		pass

	def _real_extract(self, url):
		"""Subclass hook: the actual extraction work."""
		pass
720
721 class YoutubeIE(InfoExtractor):
722         """Information extractor for youtube.com."""
723
724         _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
725         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
726         _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
727         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
728         _NETRC_MACHINE = 'youtube'
729         # Listed in order of quality
730         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
731         _video_extensions = {
732                 '13': '3gp',
733                 '17': 'mp4',
734                 '18': 'mp4',
735                 '22': 'mp4',
736                 '37': 'mp4',
737                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
738                 '43': 'webm',
739                 '45': 'webm',
740         }
741
742         @staticmethod
743         def suitable(url):
744                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
745
746         def report_lang(self):
747                 """Report attempt to set language."""
748                 self._downloader.to_screen(u'[youtube] Setting language')
749
750         def report_login(self):
751                 """Report attempt to log in."""
752                 self._downloader.to_screen(u'[youtube] Logging in')
753         
754         def report_age_confirmation(self):
755                 """Report attempt to confirm age."""
756                 self._downloader.to_screen(u'[youtube] Confirming age')
757         
758         def report_video_webpage_download(self, video_id):
759                 """Report attempt to download video webpage."""
760                 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
761         
762         def report_video_info_webpage_download(self, video_id):
763                 """Report attempt to download video info webpage."""
764                 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
765         
766         def report_information_extraction(self, video_id):
767                 """Report attempt to extract video information."""
768                 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
769         
770         def report_unavailable_format(self, video_id, format):
771                 """Report extracted video URL."""
772                 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
773         
774         def report_rtmp_download(self):
775                 """Indicate the download will use the RTMP protocol."""
776                 self._downloader.to_screen(u'[youtube] RTMP download detected')
777         
778         def _real_initialize(self):
779                 if self._downloader is None:
780                         return
781
782                 username = None
783                 password = None
784                 downloader_params = self._downloader.params
785
786                 # Attempt to use provided username and password or .netrc data
787                 if downloader_params.get('username', None) is not None:
788                         username = downloader_params['username']
789                         password = downloader_params['password']
790                 elif downloader_params.get('usenetrc', False):
791                         try:
792                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
793                                 if info is not None:
794                                         username = info[0]
795                                         password = info[2]
796                                 else:
797                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
798                         except (IOError, netrc.NetrcParseError), err:
799                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
800                                 return
801
802                 # Set language
803                 request = urllib2.Request(self._LANG_URL, None, std_headers)
804                 try:
805                         self.report_lang()
806                         urllib2.urlopen(request).read()
807                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
808                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
809                         return
810
811                 # No authentication to be performed
812                 if username is None:
813                         return
814
815                 # Log in
816                 login_form = {
817                                 'current_form': 'loginForm',
818                                 'next':         '/',
819                                 'action_login': 'Log In',
820                                 'username':     username,
821                                 'password':     password,
822                                 }
823                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
824                 try:
825                         self.report_login()
826                         login_results = urllib2.urlopen(request).read()
827                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
828                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
829                                 return
830                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
831                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
832                         return
833         
834                 # Confirm age
835                 age_form = {
836                                 'next_url':             '/',
837                                 'action_confirm':       'Confirm',
838                                 }
839                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
840                 try:
841                         self.report_age_confirmation()
842                         age_results = urllib2.urlopen(request).read()
843                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
844                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
845                         return
846
847         def _real_extract(self, url):
848                 # Extract video id from URL
849                 mobj = re.match(self._VALID_URL, url)
850                 if mobj is None:
851                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
852                         return
853                 video_id = mobj.group(2)
854
855                 # Get video webpage
856                 self.report_video_webpage_download(video_id)
857                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
858                 try:
859                         video_webpage = urllib2.urlopen(request).read()
860                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
861                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
862                         return
863
864                 # Attempt to extract SWF player URL
865                 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
866                 if mobj is not None:
867                         player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
868                 else:
869                         player_url = None
870
871                 # Get video info
872                 self.report_video_info_webpage_download(video_id)
873                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
874                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
875                                            % (video_id, el_type))
876                         request = urllib2.Request(video_info_url, None, std_headers)
877                         try:
878                                 video_info_webpage = urllib2.urlopen(request).read()
879                                 video_info = parse_qs(video_info_webpage)
880                                 if 'token' in video_info:
881                                         break
882                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
883                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
884                                 return
885                 if 'token' not in video_info:
886                         if 'reason' in video_info:
887                                 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
888                         else:
889                                 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
890                         return
891
892                 # Start extracting information
893                 self.report_information_extraction(video_id)
894
895                 # uploader
896                 if 'author' not in video_info:
897                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
898                         return
899                 video_uploader = urllib.unquote_plus(video_info['author'][0])
900
901                 # title
902                 if 'title' not in video_info:
903                         self._downloader.trouble(u'ERROR: unable to extract video title')
904                         return
905                 video_title = urllib.unquote_plus(video_info['title'][0])
906                 video_title = video_title.decode('utf-8')
907                 video_title = sanitize_title(video_title)
908
909                 # simplified title
910                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
911                 simple_title = simple_title.strip(ur'_')
912
913                 # thumbnail image
914                 if 'thumbnail_url' not in video_info:
915                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
916                         video_thumbnail = ''
917                 else:   # don't panic if we can't find it
918                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
919
920                 # upload date
921                 upload_date = u'NA'
922                 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
923                 if mobj is not None:
924                         upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
925                         format_expressions = ['%d %B %Y', '%B %d %Y']
926                         for expression in format_expressions:
927                                 try:
928                                         upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
929                                 except:
930                                         pass
931
932                 # description
933                 video_description = 'No description available.'
934                 if self._downloader.params.get('forcedescription', False):
935                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
936                         if mobj is not None:
937                                 video_description = mobj.group(1)
938
939                 # token
940                 video_token = urllib.unquote_plus(video_info['token'][0])
941
942                 # Decide which formats to download
943                 requested_format = self._downloader.params.get('format', None)
944                 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
945
946                 if 'fmt_url_map' in video_info:
947                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
948                         format_limit = self._downloader.params.get('format_limit', None)
949                         if format_limit is not None and format_limit in self._available_formats:
950                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
951                         else:
952                                 format_list = self._available_formats
953                         existing_formats = [x for x in format_list if x in url_map]
954                         if len(existing_formats) == 0:
955                                 self._downloader.trouble(u'ERROR: no known formats available for video')
956                                 return
957                         if requested_format is None:
958                                 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
959                         elif requested_format == '-1':
960                                 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
961                         else:
962                                 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
963
964                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
965                         self.report_rtmp_download()
966                         video_url_list = [(None, video_info['conn'][0])]
967
968                 else:
969                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
970                         return
971
972                 for format_param, video_real_url in video_url_list:
973                         # At this point we have a new video
974                         self._downloader.increment_downloads()
975
976                         # Extension
977                         video_extension = self._video_extensions.get(format_param, 'flv')
978
979                         # Find the video URL in fmt_url_map or conn paramters
980                         try:
981                                 # Process video information
982                                 self._downloader.process_info({
983                                         'id':           video_id.decode('utf-8'),
984                                         'url':          video_real_url.decode('utf-8'),
985                                         'uploader':     video_uploader.decode('utf-8'),
986                                         'upload_date':  upload_date,
987                                         'title':        video_title,
988                                         'stitle':       simple_title,
989                                         'ext':          video_extension.decode('utf-8'),
990                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
991                                         'thumbnail':    video_thumbnail.decode('utf-8'),
992                                         'description':  video_description.decode('utf-8'),
993                                         'player_url':   player_url,
994                                 })
995                         except UnavailableVideoError, err:
996                                 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
997
998
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the simplified title.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YouTube IE used to delegate 'yt-' videos; set in __init__.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Keeps a YoutubeIE to delegate YouTube-hosted videos to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if the URL is a metacafe.com watch URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page, then POST the filter
		form to confirm age so restricted videos become accessible."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the media URL, uploader and title from a watch page
		and pass the resulting info dict to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube: ids of the form 'yt-<id>'
		# are YouTube-hosted and are delegated to the YouTube IE.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		# NOTE(review): unlike the other requests in this class, this one
		# does not send std_headers — confirm that is intentional.
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			# Primary path: a direct mediaURL parameter on the page.
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (appended as an access token)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback path: parse the flash player's "flashvars" value
			# and pull mediaURL and key out of its mediaData field.
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# The URL is JSON-escaped; undo the escaped slashes.
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1142
1143
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 is the video id, group 2 the simplified title.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if the URL is a Dailymotion video URL."""
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""No initialization needed for Dailymotion."""
		return

	def _real_extract(self, url):
		"""Extract the media URL, uploader and title from the video page
		and pass the resulting info dict to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		# Dailymotion serves flv only here.
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage; the media URL is
		# passed to the flash player via addVariable("video", ...).
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1231
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Matches videoplay URLs on the various country-specific Google Video
	# domains; group(1) captures the docid used as the video id.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No authentication or setup needed for Google Video.
		return

	def _real_extract(self, url):
		"""Download the video page and scrape URL, title and (optionally) thumbnail.

		Errors are reported through self._downloader.trouble() followed by
		an early return; nothing is raised to the caller.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# Default to mp4; switched to flv below if only the flash URL is found.
		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Preferred: direct download URL embedded in the page.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# Fall back to the flash player URL (escaped as \x3d / \x26).
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Un-escape the JS hex escapes ('=' and '&') left in the flash URL.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse any run of non-alphanumeric characters to '_' for filenames.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			# The thumbnail lives on the search-results page, found by docid.
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		# NOTE(review): video_description and video_thumbnail are extracted
		# above but never included in the dict below (unlike YahooIE) —
		# confirm whether they should be forwarded to process_info.
		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1341
1342
1343 class PhotobucketIE(InfoExtractor):
1344         """Information extractor for photobucket.com."""
1345
1346         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1347
1348         def __init__(self, downloader=None):
1349                 InfoExtractor.__init__(self, downloader)
1350
1351         @staticmethod
1352         def suitable(url):
1353                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1354
1355         def report_download_webpage(self, video_id):
1356                 """Report webpage download."""
1357                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1358
1359         def report_extraction(self, video_id):
1360                 """Report information extraction."""
1361                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1362
1363         def _real_initialize(self):
1364                 return
1365
1366         def _real_extract(self, url):
1367                 # Extract id from URL
1368                 mobj = re.match(self._VALID_URL, url)
1369                 if mobj is None:
1370                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1371                         return
1372
1373                 # At this point we have a new video
1374                 self._downloader.increment_downloads()
1375                 video_id = mobj.group(1)
1376
1377                 video_extension = 'flv'
1378
1379                 # Retrieve video webpage to extract further information
1380                 request = urllib2.Request(url)
1381                 try:
1382                         self.report_download_webpage(video_id)
1383                         webpage = urllib2.urlopen(request).read()
1384                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1385                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1386                         return
1387
1388                 # Extract URL, uploader, and title from webpage
1389                 self.report_extraction(video_id)
1390                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1391                 if mobj is None:
1392                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1393                         return
1394                 mediaURL = urllib.unquote(mobj.group(1))
1395
1396                 video_url = mediaURL
1397
1398                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1399                 if mobj is None:
1400                         self._downloader.trouble(u'ERROR: unable to extract title')
1401                         return
1402                 video_title = mobj.group(1).decode('utf-8')
1403                 video_title = sanitize_title(video_title)
1404                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1405
1406                 video_uploader = mobj.group(2).decode('utf-8')
1407
1408                 try:
1409                         # Process video information
1410                         self._downloader.process_info({
1411                                 'id':           video_id.decode('utf-8'),
1412                                 'url':          video_url.decode('utf-8'),
1413                                 'uploader':     video_uploader,
1414                                 'upload_date':  u'NA',
1415                                 'title':        video_title,
1416                                 'stitle':       simple_title,
1417                                 'ext':          video_extension.decode('utf-8'),
1418                                 'format':       u'NA',
1419                                 'player_url':   None,
1420                         })
1421                 except UnavailableVideoError:
1422                         self._downloader.trouble(u'ERROR: unable to download video')
1423
1424
1425 class YahooIE(InfoExtractor):
1426         """Information extractor for video.yahoo.com."""
1427
1428         # _VALID_URL matches all Yahoo! Video URLs
1429         # _VPAGE_URL matches only the extractable '/watch/' URLs
1430         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1431         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1432
1433         def __init__(self, downloader=None):
1434                 InfoExtractor.__init__(self, downloader)
1435
1436         @staticmethod
1437         def suitable(url):
1438                 return (re.match(YahooIE._VALID_URL, url) is not None)
1439
1440         def report_download_webpage(self, video_id):
1441                 """Report webpage download."""
1442                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1443
1444         def report_extraction(self, video_id):
1445                 """Report information extraction."""
1446                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1447
1448         def _real_initialize(self):
1449                 return
1450
1451         def _real_extract(self, url, new_video=True):
1452                 # Extract ID from URL
1453                 mobj = re.match(self._VALID_URL, url)
1454                 if mobj is None:
1455                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1456                         return
1457
1458                 # At this point we have a new video
1459                 self._downloader.increment_downloads()
1460                 video_id = mobj.group(2)
1461                 video_extension = 'flv'
1462
1463                 # Rewrite valid but non-extractable URLs as
1464                 # extractable English language /watch/ URLs
1465                 if re.match(self._VPAGE_URL, url) is None:
1466                         request = urllib2.Request(url)
1467                         try:
1468                                 webpage = urllib2.urlopen(request).read()
1469                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1470                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1471                                 return
1472
1473                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1474                         if mobj is None:
1475                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1476                                 return
1477                         yahoo_id = mobj.group(1)
1478
1479                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1480                         if mobj is None:
1481                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1482                                 return
1483                         yahoo_vid = mobj.group(1)
1484
1485                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1486                         return self._real_extract(url, new_video=False)
1487
1488                 # Retrieve video webpage to extract further information
1489                 request = urllib2.Request(url)
1490                 try:
1491                         self.report_download_webpage(video_id)
1492                         webpage = urllib2.urlopen(request).read()
1493                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1494                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1495                         return
1496
1497                 # Extract uploader and title from webpage
1498                 self.report_extraction(video_id)
1499                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1500                 if mobj is None:
1501                         self._downloader.trouble(u'ERROR: unable to extract video title')
1502                         return
1503                 video_title = mobj.group(1).decode('utf-8')
1504                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1505
1506                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1507                 if mobj is None:
1508                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1509                         return
1510                 video_uploader = mobj.group(1).decode('utf-8')
1511
1512                 # Extract video thumbnail
1513                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1514                 if mobj is None:
1515                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1516                         return
1517                 video_thumbnail = mobj.group(1).decode('utf-8')
1518
1519                 # Extract video description
1520                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1521                 if mobj is None:
1522                         self._downloader.trouble(u'ERROR: unable to extract video description')
1523                         return
1524                 video_description = mobj.group(1).decode('utf-8')
1525                 if not video_description: video_description = 'No description available.'
1526
1527                 # Extract video height and width
1528                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1529                 if mobj is None:
1530                         self._downloader.trouble(u'ERROR: unable to extract video height')
1531                         return
1532                 yv_video_height = mobj.group(1)
1533
1534                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1535                 if mobj is None:
1536                         self._downloader.trouble(u'ERROR: unable to extract video width')
1537                         return
1538                 yv_video_width = mobj.group(1)
1539
1540                 # Retrieve video playlist to extract media URL
1541                 # I'm not completely sure what all these options are, but we
1542                 # seem to need most of them, otherwise the server sends a 401.
1543                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1544                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1545                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1546                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1547                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1548                 try:
1549                         self.report_download_webpage(video_id)
1550                         webpage = urllib2.urlopen(request).read()
1551                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1552                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1553                         return
1554
1555                 # Extract media URL from playlist XML
1556                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1557                 if mobj is None:
1558                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1559                         return
1560                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1561                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1562
1563                 try:
1564                         # Process video information
1565                         self._downloader.process_info({
1566                                 'id':           video_id.decode('utf-8'),
1567                                 'url':          video_url,
1568                                 'uploader':     video_uploader,
1569                                 'upload_date':  u'NA',
1570                                 'title':        video_title,
1571                                 'stitle':       simple_title,
1572                                 'ext':          video_extension.decode('utf-8'),
1573                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1574                                 'description':  video_description,
1575                                 'thumbnail':    video_thumbnail,
1576                                 'description':  video_description,
1577                                 'player_url':   None,
1578                         })
1579                 except UnavailableVideoError:
1580                         self._downloader.trouble(u'ERROR: unable to download video')
1581
1582
1583 class GenericIE(InfoExtractor):
1584         """Generic last-resort information extractor."""
1585
1586         def __init__(self, downloader=None):
1587                 InfoExtractor.__init__(self, downloader)
1588
1589         @staticmethod
1590         def suitable(url):
1591                 return True
1592
1593         def report_download_webpage(self, video_id):
1594                 """Report webpage download."""
1595                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1596                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1597
1598         def report_extraction(self, video_id):
1599                 """Report information extraction."""
1600                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1601
1602         def _real_initialize(self):
1603                 return
1604
1605         def _real_extract(self, url):
1606                 # At this point we have a new video
1607                 self._downloader.increment_downloads()
1608
1609                 video_id = url.split('/')[-1]
1610                 request = urllib2.Request(url)
1611                 try:
1612                         self.report_download_webpage(video_id)
1613                         webpage = urllib2.urlopen(request).read()
1614                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1615                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1616                         return
1617                 except ValueError, err:
1618                         # since this is the last-resort InfoExtractor, if
1619                         # this error is thrown, it'll be thrown here
1620                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1621                         return
1622
1623                 self.report_extraction(video_id)
1624                 # Start with something easy: JW Player in SWFObject
1625                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1626                 if mobj is None:
1627                         # Broaden the search a little bit
1628                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1629                 if mobj is None:
1630                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1631                         return
1632
1633                 # It's possible that one of the regexes
1634                 # matched, but returned an empty group:
1635                 if mobj.group(1) is None:
1636                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1637                         return
1638
1639                 video_url = urllib.unquote(mobj.group(1))
1640                 video_id  = os.path.basename(video_url)
1641
1642                 # here's a fun little line of code for you:
1643                 video_extension = os.path.splitext(video_id)[1][1:]
1644                 video_id        = os.path.splitext(video_id)[0]
1645
1646                 # it's tempting to parse this further, but you would
1647                 # have to take into account all the variations like
1648                 #   Video Title - Site Name
1649                 #   Site Name | Video Title
1650                 #   Video Title - Tagline | Site Name
1651                 # and so on and so forth; it's just not practical
1652                 mobj = re.search(r'<title>(.*)</title>', webpage)
1653                 if mobj is None:
1654                         self._downloader.trouble(u'ERROR: unable to extract title')
1655                         return
1656                 video_title = mobj.group(1).decode('utf-8')
1657                 video_title = sanitize_title(video_title)
1658                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1659
1660                 # video uploader is domain name
1661                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1662                 if mobj is None:
1663                         self._downloader.trouble(u'ERROR: unable to extract title')
1664                         return
1665                 video_uploader = mobj.group(1).decode('utf-8')
1666
1667                 try:
1668                         # Process video information
1669                         self._downloader.process_info({
1670                                 'id':           video_id.decode('utf-8'),
1671                                 'url':          video_url.decode('utf-8'),
1672                                 'uploader':     video_uploader,
1673                                 'upload_date':  u'NA',
1674                                 'title':        video_title,
1675                                 'stitle':       simple_title,
1676                                 'ext':          video_extension.decode('utf-8'),
1677                                 'format':       u'NA',
1678                                 'player_url':   None,
1679                         })
1680                 except UnavailableVideoError, err:
1681                         self._downloader.trouble(u'ERROR: unable to download video')
1682
1683
1684 class YoutubeSearchIE(InfoExtractor):
1685         """Information Extractor for YouTube search queries."""
1686         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1687         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1688         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1689         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1690         _youtube_ie = None
1691         _max_youtube_results = 1000
1692
1693         def __init__(self, youtube_ie, downloader=None):
1694                 InfoExtractor.__init__(self, downloader)
1695                 self._youtube_ie = youtube_ie
1696         
1697         @staticmethod
1698         def suitable(url):
1699                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1700
1701         def report_download_page(self, query, pagenum):
1702                 """Report attempt to download playlist page with given number."""
1703                 query = query.decode(preferredencoding())
1704                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1705
1706         def _real_initialize(self):
1707                 self._youtube_ie.initialize()
1708         
1709         def _real_extract(self, query):
1710                 mobj = re.match(self._VALID_QUERY, query)
1711                 if mobj is None:
1712                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1713                         return
1714
1715                 prefix, query = query.split(':')
1716                 prefix = prefix[8:]
1717                 query  = query.encode('utf-8')
1718                 if prefix == '':
1719                         self._download_n_results(query, 1)
1720                         return
1721                 elif prefix == 'all':
1722                         self._download_n_results(query, self._max_youtube_results)
1723                         return
1724                 else:
1725                         try:
1726                                 n = long(prefix)
1727                                 if n <= 0:
1728                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1729                                         return
1730                                 elif n > self._max_youtube_results:
1731                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1732                                         n = self._max_youtube_results
1733                                 self._download_n_results(query, n)
1734                                 return
1735                         except ValueError: # parsing prefix as integer fails
1736                                 self._download_n_results(query, 1)
1737                                 return
1738
1739         def _download_n_results(self, query, n):
1740                 """Downloads a specified number of results for a query"""
1741
1742                 video_ids = []
1743                 already_seen = set()
1744                 pagenum = 1
1745
1746                 while True:
1747                         self.report_download_page(query, pagenum)
1748                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1749                         request = urllib2.Request(result_url, None, std_headers)
1750                         try:
1751                                 page = urllib2.urlopen(request).read()
1752                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1753                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1754                                 return
1755
1756                         # Extract video identifiers
1757                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1758                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1759                                 if video_id not in already_seen:
1760                                         video_ids.append(video_id)
1761                                         already_seen.add(video_id)
1762                                         if len(video_ids) == n:
1763                                                 # Specified n videos reached
1764                                                 for id in video_ids:
1765                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1766                                                 return
1767
1768                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1769                                 for id in video_ids:
1770                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1771                                 return
1772
1773                         pagenum = pagenum + 1
1774
1775 class GoogleSearchIE(InfoExtractor):
1776         """Information Extractor for Google Video search queries."""
1777         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1778         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1779         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1780         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1781         _google_ie = None
1782         _max_google_results = 1000
1783
1784         def __init__(self, google_ie, downloader=None):
1785                 InfoExtractor.__init__(self, downloader)
1786                 self._google_ie = google_ie
1787         
1788         @staticmethod
1789         def suitable(url):
1790                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1791
1792         def report_download_page(self, query, pagenum):
1793                 """Report attempt to download playlist page with given number."""
1794                 query = query.decode(preferredencoding())
1795                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1796
1797         def _real_initialize(self):
1798                 self._google_ie.initialize()
1799         
1800         def _real_extract(self, query):
1801                 mobj = re.match(self._VALID_QUERY, query)
1802                 if mobj is None:
1803                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1804                         return
1805
1806                 prefix, query = query.split(':')
1807                 prefix = prefix[8:]
1808                 query  = query.encode('utf-8')
1809                 if prefix == '':
1810                         self._download_n_results(query, 1)
1811                         return
1812                 elif prefix == 'all':
1813                         self._download_n_results(query, self._max_google_results)
1814                         return
1815                 else:
1816                         try:
1817                                 n = long(prefix)
1818                                 if n <= 0:
1819                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1820                                         return
1821                                 elif n > self._max_google_results:
1822                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1823                                         n = self._max_google_results
1824                                 self._download_n_results(query, n)
1825                                 return
1826                         except ValueError: # parsing prefix as integer fails
1827                                 self._download_n_results(query, 1)
1828                                 return
1829
1830         def _download_n_results(self, query, n):
1831                 """Downloads a specified number of results for a query"""
1832
1833                 video_ids = []
1834                 already_seen = set()
1835                 pagenum = 1
1836
1837                 while True:
1838                         self.report_download_page(query, pagenum)
1839                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1840                         request = urllib2.Request(result_url, None, std_headers)
1841                         try:
1842                                 page = urllib2.urlopen(request).read()
1843                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1844                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1845                                 return
1846
1847                         # Extract video identifiers
1848                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1849                                 video_id = mobj.group(1)
1850                                 if video_id not in already_seen:
1851                                         video_ids.append(video_id)
1852                                         already_seen.add(video_id)
1853                                         if len(video_ids) == n:
1854                                                 # Specified n videos reached
1855                                                 for id in video_ids:
1856                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1857                                                 return
1858
1859                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1860                                 for id in video_ids:
1861                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1862                                 return
1863
1864                         pagenum = pagenum + 1
1865
1866 class YahooSearchIE(InfoExtractor):
1867         """Information Extractor for Yahoo! Video search queries."""
1868         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1869         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1870         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1871         _MORE_PAGES_INDICATOR = r'\s*Next'
1872         _yahoo_ie = None
1873         _max_yahoo_results = 1000
1874
1875         def __init__(self, yahoo_ie, downloader=None):
1876                 InfoExtractor.__init__(self, downloader)
1877                 self._yahoo_ie = yahoo_ie
1878         
1879         @staticmethod
1880         def suitable(url):
1881                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1882
1883         def report_download_page(self, query, pagenum):
1884                 """Report attempt to download playlist page with given number."""
1885                 query = query.decode(preferredencoding())
1886                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1887
1888         def _real_initialize(self):
1889                 self._yahoo_ie.initialize()
1890         
1891         def _real_extract(self, query):
1892                 mobj = re.match(self._VALID_QUERY, query)
1893                 if mobj is None:
1894                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1895                         return
1896
1897                 prefix, query = query.split(':')
1898                 prefix = prefix[8:]
1899                 query  = query.encode('utf-8')
1900                 if prefix == '':
1901                         self._download_n_results(query, 1)
1902                         return
1903                 elif prefix == 'all':
1904                         self._download_n_results(query, self._max_yahoo_results)
1905                         return
1906                 else:
1907                         try:
1908                                 n = long(prefix)
1909                                 if n <= 0:
1910                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1911                                         return
1912                                 elif n > self._max_yahoo_results:
1913                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1914                                         n = self._max_yahoo_results
1915                                 self._download_n_results(query, n)
1916                                 return
1917                         except ValueError: # parsing prefix as integer fails
1918                                 self._download_n_results(query, 1)
1919                                 return
1920
1921         def _download_n_results(self, query, n):
1922                 """Downloads a specified number of results for a query"""
1923
1924                 video_ids = []
1925                 already_seen = set()
1926                 pagenum = 1
1927
1928                 while True:
1929                         self.report_download_page(query, pagenum)
1930                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1931                         request = urllib2.Request(result_url, None, std_headers)
1932                         try:
1933                                 page = urllib2.urlopen(request).read()
1934                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1935                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1936                                 return
1937
1938                         # Extract video identifiers
1939                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1940                                 video_id = mobj.group(1)
1941                                 if video_id not in already_seen:
1942                                         video_ids.append(video_id)
1943                                         already_seen.add(video_id)
1944                                         if len(video_ids) == n:
1945                                                 # Specified n videos reached
1946                                                 for id in video_ids:
1947                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1948                                                 return
1949
1950                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1951                                 for id in video_ids:
1952                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1953                                 return
1954
1955                         pagenum = pagenum + 1
1956
1957 class YoutubePlaylistIE(InfoExtractor):
1958         """Information Extractor for YouTube playlists."""
1959
1960         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1961         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1962         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1963         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1964         _youtube_ie = None
1965
1966         def __init__(self, youtube_ie, downloader=None):
1967                 InfoExtractor.__init__(self, downloader)
1968                 self._youtube_ie = youtube_ie
1969         
1970         @staticmethod
1971         def suitable(url):
1972                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1973
1974         def report_download_page(self, playlist_id, pagenum):
1975                 """Report attempt to download playlist page with given number."""
1976                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1977
1978         def _real_initialize(self):
1979                 self._youtube_ie.initialize()
1980         
1981         def _real_extract(self, url):
1982                 # Extract playlist id
1983                 mobj = re.match(self._VALID_URL, url)
1984                 if mobj is None:
1985                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1986                         return
1987
1988                 # Download playlist pages
1989                 playlist_id = mobj.group(1)
1990                 video_ids = []
1991                 pagenum = 1
1992
1993                 while True:
1994                         self.report_download_page(playlist_id, pagenum)
1995                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1996                         try:
1997                                 page = urllib2.urlopen(request).read()
1998                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1999                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2000                                 return
2001
2002                         # Extract video identifiers
2003                         ids_in_page = []
2004                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2005                                 if mobj.group(1) not in ids_in_page:
2006                                         ids_in_page.append(mobj.group(1))
2007                         video_ids.extend(ids_in_page)
2008
2009                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2010                                 break
2011                         pagenum = pagenum + 1
2012
2013                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2014                 playlistend = self._downloader.params.get('playlistend', -1)
2015                 video_ids = video_ids[playliststart:playlistend]
2016
2017                 for id in video_ids:
2018                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2019                 return
2020
2021 class YoutubeUserIE(InfoExtractor):
2022         """Information Extractor for YouTube users."""
2023
2024         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2025         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2026         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2027         _youtube_ie = None
2028
2029         def __init__(self, youtube_ie, downloader=None):
2030                 InfoExtractor.__init__(self, downloader)
2031                 self._youtube_ie = youtube_ie
2032         
2033         @staticmethod
2034         def suitable(url):
2035                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2036
2037         def report_download_page(self, username):
2038                 """Report attempt to download user page."""
2039                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2040
2041         def _real_initialize(self):
2042                 self._youtube_ie.initialize()
2043         
2044         def _real_extract(self, url):
2045                 # Extract username
2046                 mobj = re.match(self._VALID_URL, url)
2047                 if mobj is None:
2048                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2049                         return
2050
2051                 # Download user page
2052                 username = mobj.group(1)
2053                 video_ids = []
2054                 pagenum = 1
2055
2056                 self.report_download_page(username)
2057                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2058                 try:
2059                         page = urllib2.urlopen(request).read()
2060                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2061                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2062                         return
2063
2064                 # Extract video identifiers
2065                 ids_in_page = []
2066
2067                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2068                         if mobj.group(1) not in ids_in_page:
2069                                 ids_in_page.append(mobj.group(1))
2070                 video_ids.extend(ids_in_page)
2071
2072                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2073                 playlistend = self._downloader.params.get('playlistend', -1)
2074                 video_ids = video_ids[playliststart:playlistend]
2075
2076                 for id in video_ids:
2077                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2078                 return
2079
class DepositFilesIE(InfoExtractor):
	"""Information extractor for depositfiles.com"""

	# The optional "../" segment matches a two-character locale path
	# component (e.g. /de/); "(?#locale)" is only an inline regex comment.
	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(DepositFilesIE._VALID_URL, url) is not None)

	def report_download_webpage(self, file_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

	def report_extraction(self, file_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

	def _real_initialize(self):
		# No login or other setup is required for depositfiles.
		return

	def _real_extract(self, url):
		"""Scrape the file page and hand the file's metadata to the downloader."""
		# At this point we have a new file
		self._downloader.increment_downloads()

		file_id = url.split('/')[-1]
		# Rebuild url in english locale
		url = 'http://depositfiles.com/en/files/' + file_id

		# Retrieve file webpage with 'Free download' button pressed
		# (POSTing gateway_result=1 emulates the button click).
		free_download_indication = { 'gateway_result' : '1' }
		request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
		try:
			self.report_download_webpage(file_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
			return

		# Search for the real file URL
		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
		if (mobj is None) or (mobj.group(1) is None):
			# Try to figure out reason of the error.
			# The site reports restrictions (e.g. download limits) inside
			# a <strong>Attention...</strong> element.
			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
			if (mobj is not None) and (mobj.group(1) is not None):
				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
				self._downloader.trouble(u'ERROR: %s' % restriction_message)
			else:
				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
			return

		file_url = mobj.group(1)
		# Extension is taken from the download URL, without the dot.
		file_extension = os.path.splitext(file_url)[1][1:]

		# Search for file title
		mobj = re.search(r'<b title="(.*?)">', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		file_title = mobj.group(1).decode('utf-8')

		try:
			# Process file information
			self._downloader.process_info({
				'id':           file_id.decode('utf-8'),
				'url':          file_url.decode('utf-8'),
				'uploader':     u'NA',
				'upload_date':  u'NA',
				'title':        file_title,
				'stitle':       file_title,
				'ext':          file_extension.decode('utf-8'),
				'format':       u'NA',
				'player_url':   None,
			})
		except UnavailableVideoError, err:
			self._downloader.trouble(u'ERROR: unable to download file')
2158
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one: the first receives an initial argument, every later one receives
	the value returned by its predecessor.

	The chain stops as soon as a processor returns None, or when its end
	is reached.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to a downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The "information" argument is a dictionary of the kind produced
		by InfoExtractors, extended with a "filepath" key that names the
		downloaded file.

		Return None to halt the postprocessing chain, or an information
		dictionary (possibly the received one with some fields changed)
		to pass along to the next processor in the chain. May raise a
		PostProcessingError, which the calling downloader takes into
		account.
		"""
		return information # default implementation: pass through unchanged
2204         
2205 ### MAIN PROGRAM ###
2206 if __name__ == '__main__':
2207         try:
2208                 # Modules needed only when running the main program
2209                 import getpass
2210                 import optparse
2211
2212                 # Function to update the program file with the latest version from bitbucket.org
2213                 def update_self(downloader, filename):
2214                         # Note: downloader only used for options
2215                         if not os.access (filename, os.W_OK):
2216                                 sys.exit('ERROR: no write permissions on %s' % filename)
2217
2218                         downloader.to_screen('Updating to latest stable version...')
2219                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2220                         latest_version = urllib.urlopen(latest_url).read().strip()
2221                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2222                         newcontent = urllib.urlopen(prog_url).read()
2223                         stream = open(filename, 'w')
2224                         stream.write(newcontent)
2225                         stream.close()
2226                         downloader.to_screen('Updated to version %s' % latest_version)
2227
2228                 # Parse command line
2229                 parser = optparse.OptionParser(
2230                         usage='Usage: %prog [options] url...',
2231                         version='2010.11.19',
2232                         conflict_handler='resolve',
2233                 )
2234
2235                 parser.add_option('-h', '--help',
2236                                 action='help', help='print this help text and exit')
2237                 parser.add_option('-v', '--version',
2238                                 action='version', help='print program version and exit')
2239                 parser.add_option('-U', '--update',
2240                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2241                 parser.add_option('-i', '--ignore-errors',
2242                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2243                 parser.add_option('-r', '--rate-limit',
2244                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2245                 parser.add_option('-R', '--retries',
2246                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2247                 parser.add_option('--playlist-start',
2248                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2249                 parser.add_option('--playlist-end',
2250                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2251
2252                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2253                 authentication.add_option('-u', '--username',
2254                                 dest='username', metavar='USERNAME', help='account username')
2255                 authentication.add_option('-p', '--password',
2256                                 dest='password', metavar='PASSWORD', help='account password')
2257                 authentication.add_option('-n', '--netrc',
2258                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2259                 parser.add_option_group(authentication)
2260
2261                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2262                 video_format.add_option('-f', '--format',
2263                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2264                 video_format.add_option('-m', '--mobile-version',
2265                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2266                 video_format.add_option('--all-formats',
2267                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2268                 video_format.add_option('--max-quality',
2269                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2270                 video_format.add_option('-b', '--best-quality',
2271                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2272                 parser.add_option_group(video_format)
2273
2274                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2275                 verbosity.add_option('-q', '--quiet',
2276                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2277                 verbosity.add_option('-s', '--simulate',
2278                                 action='store_true', dest='simulate', help='do not download video', default=False)
2279                 verbosity.add_option('-g', '--get-url',
2280                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2281                 verbosity.add_option('-e', '--get-title',
2282                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2283                 verbosity.add_option('--get-thumbnail',
2284                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2285                 verbosity.add_option('--get-description',
2286                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2287                 verbosity.add_option('--no-progress',
2288                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2289                 parser.add_option_group(verbosity)
2290
2291                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2292                 filesystem.add_option('-t', '--title',
2293                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2294                 filesystem.add_option('-l', '--literal',
2295                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2296                 filesystem.add_option('-A', '--auto-number',
2297                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2298                 filesystem.add_option('-o', '--output',
2299                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2300                 filesystem.add_option('-a', '--batch-file',
2301                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2302                 filesystem.add_option('-w', '--no-overwrites',
2303                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2304                 filesystem.add_option('-c', '--continue',
2305                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2306                 filesystem.add_option('--cookies',
2307                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2308                 parser.add_option_group(filesystem)
2309
2310                 (opts, args) = parser.parse_args()
2311
2312                 # Open appropriate CookieJar
2313                 if opts.cookiefile is None:
2314                         jar = cookielib.CookieJar()
2315                 else:
2316                         try:
2317                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2318                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2319                                         jar.load()
2320                         except (IOError, OSError), err:
2321                                 sys.exit(u'ERROR: unable to open cookie file')
2322
2323                 # General configuration
2324                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2325                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2326                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2327                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2328
2329                 # Batch file verification
2330                 batchurls = []
2331                 if opts.batchfile is not None:
2332                         try:
2333                                 if opts.batchfile == '-':
2334                                         batchfd = sys.stdin
2335                                 else:
2336                                         batchfd = open(opts.batchfile, 'r')
2337                                 batchurls = batchfd.readlines()
2338                                 batchurls = [x.strip() for x in batchurls]
2339                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2340                         except IOError:
2341                                 sys.exit(u'ERROR: batch file could not be read')
2342                 all_urls = batchurls + args
2343
2344                 # Conflicting, missing and erroneous options
2345                 if opts.bestquality:
2346                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2347                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2348                         parser.error(u'using .netrc conflicts with giving username/password')
2349                 if opts.password is not None and opts.username is None:
2350                         parser.error(u'account username missing')
2351                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2352                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2353                 if opts.usetitle and opts.useliteral:
2354                         parser.error(u'using title conflicts with using literal title')
2355                 if opts.username is not None and opts.password is None:
2356                         opts.password = getpass.getpass(u'Type account password and press return:')
2357                 if opts.ratelimit is not None:
2358                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2359                         if numeric_limit is None:
2360                                 parser.error(u'invalid rate limit specified')
2361                         opts.ratelimit = numeric_limit
2362                 if opts.retries is not None:
2363                         try:
2364                                 opts.retries = long(opts.retries)
2365                         except (TypeError, ValueError), err:
2366                                 parser.error(u'invalid retry count specified')
2367                 try:
2368                         opts.playliststart = long(opts.playliststart)
2369                         if opts.playliststart <= 0:
2370                                 raise ValueError
2371                 except (TypeError, ValueError), err:
2372                         parser.error(u'invalid playlist start number specified')
2373                 try:
2374                         opts.playlistend = long(opts.playlistend)
2375                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2376                                 raise ValueError
2377                 except (TypeError, ValueError), err:
2378                         parser.error(u'invalid playlist end number specified')
2379
		# Information extractors
		# Several IEs are wrappers: the YouTube playlist/user/search IEs and
		# the Google/Yahoo search IEs receive the plain IE instance they are
		# constructed with (presumably to delegate per-video extraction to
		# it — confirm in the respective IE classes).
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		deposit_files_ie = DepositFilesIE()
		generic_ie = GenericIE()
2394
		# File downloader
		# All behaviour is driven by this options dict. Note that any of the
		# --get-* flags implies both quiet and simulate mode: print the
		# requested piece of information, download nothing.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template selection: this 'or' chain evaluates to the
			# first truthy alternative, so an explicit -o template always
			# wins, then the format/title/autonumber combinations in order,
			# with plain u'%(id)s.%(ext)s' as the final fallback.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			# '-o -' writes the video to stdout, so log/progress output must
			# be routed to stderr instead.
			'logtostderr': opts.outtmpl == '-',
			})
		# Registration order is the matching order: the more specific
		# extractors (searches, playlists, users) come before the plain
		# per-video IEs so they get first chance at each URL.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)
		fd.add_info_extractor(deposit_files_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: with no URLs, -U alone is a valid invocation;
		# anything else is a usage error.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		# Propagate the downloader's return code as the process exit status.
		sys.exit(retcode)
2465
	except DownloadError:
		# The downloader has already printed the error details; just exit
		# with a failure status.
		sys.exit(1)
	except SameFileError:
		# Raised when a fixed output name (-o without templates) would be
		# reused for several downloads.
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')