]> git.bitcoin.ninja Git - youtube-dl/blob - youtube-dl
Implemented depositfiles.com support
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import cookielib
8 import datetime
9 import htmlentitydefs
10 import httplib
11 import locale
12 import math
13 import netrc
14 import os
15 import os.path
16 import re
17 import socket
18 import string
19 import subprocess
20 import sys
21 import time
22 import urllib
23 import urllib2
24
25 # parse_qs was moved from the cgi module to the urlparse module recently.
26 try:
27         from urlparse import parse_qs
28 except ImportError:
29         from cgi import parse_qs
30
# HTTP headers sent with every request so that sites serve the same pages
# they would serve to a regular desktop browser.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simplified" titles (Python 2 str.decode
# turns them into unicode so they combine cleanly with unicode titles).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
39
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	Falls back to 'UTF-8' when the locale-reported encoding is
	missing or unusable.
	"""
	# The original implementation wrapped this in a one-shot generator for
	# no benefit; a plain try/except returns the same value. The bare
	# "except:" is narrowed to Exception so SystemExit/KeyboardInterrupt
	# are no longer swallowed.
	try:
		pref = locale.getpreferredencoding()
		# Probe the codec: an unknown or broken encoding raises here.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
55
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference. BUGFIX: the old pattern was
	# ur'(?u)#(x?\d+)', whose \d does not match hex letters, so a
	# reference like &#x2a; matched only "x2" and decoded the wrong
	# character. Hex references ("x" prefix) now accept hex digits,
	# while plain decimal references still accept digits only, so
	# malformed input falls through to the literal branch below.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			numstr = u'0%s' % numstr  # "x2a" -> "0x2a", accepted by long(..., 16)
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
81
82 def sanitize_title(utitle):
83         """Sanitizes a video title so it could be used as part of a filename."""
84         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
85         return utitle.replace(unicode(os.sep), u'%')
86
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		if filename == u'-':
			# "-" means standard output. On Windows stdout must be switched
			# to binary mode so the video data is not newline-mangled.
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
112
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
121
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
129
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal an error in the
	postprocessing task.
	"""
137
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available
	for that video.
	"""
145
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out to
	be smaller than the size the server announced, which usually means
	the connection was interrupted.
	"""
	# Byte counts: what was actually received vs. what the server announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
160
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	logtostderr:      Log messages to stderr instead of stdout.
	"""

	params = None              # Option dictionary (see class docstring)
	_ies = []                  # Registered InfoExtractors, tried in order
	_pps = []                  # Registered PostProcessors, run in order
	_download_retcode = None   # Process return code: 0 ok, 1 if any error occurred
	_num_downloads = None      # Ordinal assigned to each downloaded file
	_screen_file = None        # Stream used for status/progress messages

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		# Status messages go to stderr when 'logtostderr' is set, stdout otherwise.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of ancestor directory paths, shortest first.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def temp_name(filename):
		"""Returns a temporary filename for the given filename."""
		return filename + '.part'

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.50M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Power of 1024 the quantity falls under; selects the suffix below.
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return the download percentage as a fixed-width string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate the remaining time as 'MM:SS', or '--:--' when unknown."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average transfer speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size, adapting to the measured transfer rate."""
		# Keep the next block between half and double the last one.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# Suffix position in 'bkmgtpezy' is the power of 1024 to apply.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
			self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" means it contains no '%(field)s' placeholders at all, so
		# every download would be written to the same file.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average speed back under the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def try_rename(self, old_filename, new_filename):
		"""Move the temporary .part file onto its final name, reporting failures."""
		try:
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# Leading \r redraws the progress line in place.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a filename-free message if the name cannot be encoded.
			self.to_screen(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# The progress line ends without a newline; emit one now.
			self.to_screen(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			# Expand the output template with the video info plus two
			# downloader-provided fields: epoch and autonumber.
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			# A fixed template would make every URL overwrite the same file.
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				# A postprocessor returning None stops the chain.
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False otherwise (after calling trouble()).
		"""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			if prevsize == cursize and retval == 1:
				# No progress was made on the last resume attempt; give up.
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url to filename through a '.part' temporary file,
		resuming, retrying and rate-limiting as configured.

		Returns True on success, False on handled failure; may also raise
		(e.g. ContentTooShortError, or HTTP/OS errors for the caller).
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'
		# basic_request never carries a Range header; it is used to probe the
		# full length when a resume attempt gets HTTP 416.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		# NOTE: data_len comes from the HTTP header and is a string (or None).
		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the next read size to the observed transfer rate.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		stream.close()
		self.report_finish()
		# String comparison is intentional: data_len is the raw header value.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)
		return True
648
class InfoExtractor(object):
        """Base class for all information extractors.

        An information extractor takes a URL and produces, for every video it
        refers to, a dictionary of metadata which is handed to the
        FileDownloader.  Each dictionary must include: id, url (final video
        URL), uploader, title, stitle (simplified title), ext, format, and
        player_url (SWF player URL, may be None).  The optional fields
        thumbnail and description are only used by the forced-printing
        options.

        Concrete extractors redefine _real_initialize(), _real_extract() and
        the suitable() static method, and are registered with the main
        downloader.
        """

        # Class-level defaults; instances shadow them during construction.
        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Create the extractor, optionally attaching a downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Return True if this extractor can handle the given URL."""
                return False

        def initialize(self):
                """Run one-time setup (authentication, etc.) lazily."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Initialize if needed, then perform the real extraction."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Attach the FileDownloader this extractor reports to."""
                self._downloader = downloader

        def _real_initialize(self):
                """Actual setup work. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Actual extraction work. Redefine in subclasses."""
                pass
719
class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com.

        Optionally logs in and confirms age in _real_initialize(), then
        scrapes the watch page and the get_video_info endpoint to build one
        info dictionary per selected format for the FileDownloader.
        """

        # Matches bare video IDs, youtu.be short links and the known watch
        # page URL variants; group 2 captures the video ID.  The conditional
        # group (?(1).+)? requires a trailing suffix only when a URL prefix
        # actually matched (so a bare ID must not be followed by anything).
        _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
        # Visiting this URL pins the site to English/US so scraped text has a
        # predictable format.
        _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
        _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
        # Entry name looked up in the user's .netrc for credentials.
        _NETRC_MACHINE = 'youtube'
        # Listed in order of quality
        _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
        # Format code -> container extension; codes not listed here fall back
        # to 'flv' in _real_extract().
        _video_extensions = {
                '13': '3gp',
                '17': 'mp4',
                '18': 'mp4',
                '22': 'mp4',
                '37': 'mp4',
                '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
                '43': 'webm',
                '45': 'webm',
        }

        @staticmethod
        def suitable(url):
                """Return True if the URL looks like a YouTube video."""
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        def report_lang(self):
                """Report attempt to set language."""
                self._downloader.to_screen(u'[youtube] Setting language')

        def report_login(self):
                """Report attempt to log in."""
                self._downloader.to_screen(u'[youtube] Logging in')
        
        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[youtube] Confirming age')
        
        def report_video_webpage_download(self, video_id):
                """Report attempt to download video webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
        
        def report_video_info_webpage_download(self, video_id):
                """Report attempt to download video info webpage."""
                self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
        
        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
        
        def report_unavailable_format(self, video_id, format):
                """Report that the requested format is not available."""
                self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
        
        def report_rtmp_download(self):
                """Indicate the download will use the RTMP protocol."""
                self._downloader.to_screen(u'[youtube] RTMP download detected')
        
        def _real_initialize(self):
                """Set the site language and, when credentials are available,
                log in and confirm age.

                All failures here are soft: they emit a warning (or trouble)
                and return without raising.
                """
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.params

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        # Raised (and caught just below) so a missing
                                        # entry produces the same warning path as a
                                        # malformed .netrc.
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # Set language
                request = urllib2.Request(self._LANG_URL, None, std_headers)
                try:
                        self.report_lang()
                        urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
                        return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        # If the login form is still present in the response,
                        # the POST did not authenticate us.
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return
        
                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                """Download metadata for the video at `url` and hand one info
                dictionary per selected format to the downloader."""
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return
                video_id = mobj.group(2)

                # Get video webpage
                self.report_video_webpage_download(video_id)
                # NOTE(review): '&amp;has_verified=1' is sent literally in the
                # query string; looks like it was meant to be '&has_verified=1'
                # — confirm before changing, the server may ignore it either way.
                request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
                try:
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
                        return

                # Attempt to extract SWF player URL
                mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
                if mobj is not None:
                        # Undo the JavaScript escaping (e.g. \/ -> /).
                        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
                else:
                        player_url = None

                # Get video info
                self.report_video_info_webpage_download(video_id)
                # Try several 'el' values; the first response that contains a
                # 'token' parameter wins.
                for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
                        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                                           % (video_id, el_type))
                        request = urllib2.Request(video_info_url, None, std_headers)
                        try:
                                video_info_webpage = urllib2.urlopen(request).read()
                                video_info = parse_qs(video_info_webpage)
                                if 'token' in video_info:
                                        break
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                                return
                if 'token' not in video_info:
                        if 'reason' in video_info:
                                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                        else:
                                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
                        return

                # Start extracting information
                self.report_information_extraction(video_id)

                # uploader
                if 'author' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = urllib.unquote_plus(video_info['author'][0])

                # title
                if 'title' not in video_info:
                        self._downloader.trouble(u'ERROR: unable to extract video title')
                        return
                video_title = urllib.unquote_plus(video_info['title'][0])
                video_title = video_title.decode('utf-8')
                video_title = sanitize_title(video_title)

                # simplified title
                # Collapse every run of characters outside simple_title_chars
                # into a single '_' and trim leading/trailing underscores.
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # thumbnail image
                if 'thumbnail_url' not in video_info:
                        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
                        video_thumbnail = ''
                else:   # don't panic if we can't find it
                        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

                # upload date
                upload_date = u'NA'
                mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
                if mobj is not None:
                        # Normalize '/', ',' and '-' separators to single spaces
                        # before trying the date formats below.
                        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
                        format_expressions = ['%d %B %Y', '%B %d %Y']
                        for expression in format_expressions:
                                try:
                                        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                                except:
                                        # Leave upload_date unchanged when this
                                        # format does not match.
                                        pass

                # description
                video_description = 'No description available.'
                if self._downloader.params.get('forcedescription', False):
                        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
                        if mobj is not None:
                                video_description = mobj.group(1)

                # token
                video_token = urllib.unquote_plus(video_info['token'][0])

                # Decide which formats to download
                requested_format = self._downloader.params.get('format', None)
                # %%s leaves one '%s' placeholder for the format code.
                get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

                if 'fmt_url_map' in video_info:
                        # fmt_url_map is a comma-separated list of 'fmt|url' pairs.
                        url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
                        format_limit = self._downloader.params.get('format_limit', None)
                        if format_limit is not None and format_limit in self._available_formats:
                                # Only consider formats at or below the quality cap
                                # (_available_formats is ordered best-first).
                                format_list = self._available_formats[self._available_formats.index(format_limit):]
                        else:
                                format_list = self._available_formats
                        existing_formats = [x for x in format_list if x in url_map]
                        if len(existing_formats) == 0:
                                self._downloader.trouble(u'ERROR: no known formats available for video')
                                return
                        if requested_format is None:
                                video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
                        elif requested_format == '-1':
                                video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
                        else:
                                video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

                elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                        self.report_rtmp_download()
                        video_url_list = [(None, video_info['conn'][0])]

                else:
                        self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
                        return

                for format_param, video_real_url in video_url_list:
                        # At this point we have a new video
                        self._downloader.increment_downloads()

                        # Extension
                        video_extension = self._video_extensions.get(format_param, 'flv')

                        # Find the video URL in fmt_url_map or conn paramters
                        try:
                                # Process video information
                                self._downloader.process_info({
                                        'id':           video_id.decode('utf-8'),
                                        'url':          video_real_url.decode('utf-8'),
                                        'uploader':     video_uploader.decode('utf-8'),
                                        'upload_date':  upload_date,
                                        'title':        video_title,
                                        'stitle':       simple_title,
                                        'ext':          video_extension.decode('utf-8'),
                                        'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
                                        'thumbnail':    video_thumbnail.decode('utf-8'),
                                        'description':  video_description.decode('utf-8'),
                                        'player_url':   player_url,
                                })
                        except UnavailableVideoError, err:
                                self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
997
class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com.

        Delegates 'yt-' prefixed ids to the YouTube extractor; otherwise
        confirms the family-filter disclaimer once, then scrapes the watch
        page for the media URL (direct, gdaKey-signed, or from flashvars).
        """

        # Group 1 is the video id, group 2 the URL's simplified-title slug.
        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
        # YouTube IE used for metacafe entries that are really YouTube videos.
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                """Constructor. Receives the YouTube IE to delegate to and an
                optional downloader."""
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                """Return True if the URL is a metacafe watch page."""
                return (re.match(MetacafeIE._VALID_URL, url) is not None)

        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self._downloader.to_screen(u'[metacafe] Confirming age')
        
        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
        
        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

        def _real_initialize(self):
                """Fetch the disclaimer page and POST the family-filter form.

                Presumably this sets a cookie that makes age-restricted videos
                visible — TODO confirm; failures only trouble() and return.
                """
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'filters': '0',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
                        return
        
        def _real_extract(self, url):
                """Extract the media URL, title and uploader and pass one info
                dictionary to the downloader."""
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                video_id = mobj.group(1)

                # Check if video comes from YouTube
                mobj2 = re.match(r'^yt-(.*)$', video_id)
                if mobj2 is not None:
                        # Delegate the whole extraction and stop here.
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()

                simple_title = mobj.group(2).decode('utf-8')

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
                if mobj is not None:
                        mediaURL = urllib.unquote(mobj.group(1))
                        # Extension is guessed from the URL's last 3 characters.
                        video_extension = mediaURL[-3:]
                        
                        # Extract gdaKey if available
                        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                        if mobj is None:
                                video_url = mediaURL
                        else:
                                gdaKey = mobj.group(1)
                                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
                else:
                        # No direct mediaURL: fall back to the flashvars blob.
                        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        vardict = parse_qs(mobj.group(1))
                        if 'mediaData' not in vardict:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                        if mobj is None:
                                self._downloader.trouble(u'ERROR: unable to extract media URL')
                                return
                        # Undo the JSON-style escaping of slashes.
                        mediaURL = mobj.group(1).replace('\\/', '/')
                        video_extension = mediaURL[-3:]
                        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

                mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')
                video_title = sanitize_title(video_title)

                mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = mobj.group(1)

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'ERROR: unable to download video')
1141
1142
class DailymotionIE(InfoExtractor):
        """Information Extractor for Dailymotion.

        Scrapes the video page for the flash media URL, title and uploader
        and passes a single info dictionary to the downloader.
        """

        # Group 1 is the video id, group 2 the URL's title slug.
        _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                InfoExtractor.__init__(self, downloader)

        @staticmethod
        def suitable(url):
                """Return True if the URL is a Dailymotion video page."""
                return (re.match(DailymotionIE._VALID_URL, url) is not None)

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
        
        def report_extraction(self, video_id):
                """Report information extraction."""
                self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

        def _real_initialize(self):
                """No setup is needed for Dailymotion."""
                return

        def _real_extract(self, url):
                """Extract the media URL, title and uploader from the page at
                `url` and pass one info dictionary to the downloader."""
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
                        return

                # At this point we have a new video
                self._downloader.increment_downloads()
                video_id = mobj.group(1)

                simple_title = mobj.group(2).decode('utf-8')
                # Dailymotion downloads are always served as flv here.
                video_extension = 'flv'

                # Retrieve video webpage to extract further information
                request = urllib2.Request(url)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
                        return

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                # The player's addVariable("video", "...") call carries the
                # percent-encoded media URL.
                mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract media URL')
                        return
                mediaURL = urllib.unquote(mobj.group(1))

                # if needed add http://www.dailymotion.com/ if relative URL

                video_url = mediaURL

                # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
                mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract title')
                        return
                video_title = mobj.group(1).decode('utf-8')
                video_title = sanitize_title(video_title)

                mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
                if mobj is None:
                        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                        return
                video_uploader = mobj.group(1)

                try:
                        # Process video information
                        self._downloader.process_info({
                                'id':           video_id.decode('utf-8'),
                                'url':          video_url.decode('utf-8'),
                                'uploader':     video_uploader.decode('utf-8'),
                                'upload_date':  u'NA',
                                'title':        video_title,
                                'stitle':       simple_title,
                                'ext':          video_extension.decode('utf-8'),
                                'format':       u'NA',
                                'player_url':   None,
                        })
                except UnavailableVideoError:
                        self._downloader.trouble(u'ERROR: unable to download video')
1230
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Matches /videoplay URLs on any of Google Video's country domains and
	# captures the "docid" query parameter as the video id.
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor should handle the given URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# Google Video needs no login or other preparation.
		return

	def _real_extract(self, url):
		"""Download the video page, extract the media URL and title, and
		hand the collected information to the downloader.

		All failures are reported via self._downloader.trouble(); the
		method always returns None.
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		# Prefer the direct mp4 download URL; fall back to the escaped
		# flv URL embedded in the player configuration.
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript-style hex escapes for '=' and '&'.
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Collapse runs of non-alphanumerics into '_' for the simple title.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		# NOTE(review): video_description and video_thumbnail are computed
		# above but never included in the info dict below — confirm whether
		# that is intentional or they should be passed through.
		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1340
1341
1342 class PhotobucketIE(InfoExtractor):
1343         """Information extractor for photobucket.com."""
1344
1345         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1346
1347         def __init__(self, downloader=None):
1348                 InfoExtractor.__init__(self, downloader)
1349
1350         @staticmethod
1351         def suitable(url):
1352                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1353
1354         def report_download_webpage(self, video_id):
1355                 """Report webpage download."""
1356                 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1357
1358         def report_extraction(self, video_id):
1359                 """Report information extraction."""
1360                 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1361
1362         def _real_initialize(self):
1363                 return
1364
1365         def _real_extract(self, url):
1366                 # Extract id from URL
1367                 mobj = re.match(self._VALID_URL, url)
1368                 if mobj is None:
1369                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1370                         return
1371
1372                 # At this point we have a new video
1373                 self._downloader.increment_downloads()
1374                 video_id = mobj.group(1)
1375
1376                 video_extension = 'flv'
1377
1378                 # Retrieve video webpage to extract further information
1379                 request = urllib2.Request(url)
1380                 try:
1381                         self.report_download_webpage(video_id)
1382                         webpage = urllib2.urlopen(request).read()
1383                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1384                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1385                         return
1386
1387                 # Extract URL, uploader, and title from webpage
1388                 self.report_extraction(video_id)
1389                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1390                 if mobj is None:
1391                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1392                         return
1393                 mediaURL = urllib.unquote(mobj.group(1))
1394
1395                 video_url = mediaURL
1396
1397                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1398                 if mobj is None:
1399                         self._downloader.trouble(u'ERROR: unable to extract title')
1400                         return
1401                 video_title = mobj.group(1).decode('utf-8')
1402                 video_title = sanitize_title(video_title)
1403                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1404
1405                 video_uploader = mobj.group(2).decode('utf-8')
1406
1407                 try:
1408                         # Process video information
1409                         self._downloader.process_info({
1410                                 'id':           video_id.decode('utf-8'),
1411                                 'url':          video_url.decode('utf-8'),
1412                                 'uploader':     video_uploader,
1413                                 'upload_date':  u'NA',
1414                                 'title':        video_title,
1415                                 'stitle':       simple_title,
1416                                 'ext':          video_extension.decode('utf-8'),
1417                                 'format':       u'NA',
1418                                 'player_url':   None,
1419                         })
1420                 except UnavailableVideoError:
1421                         self._downloader.trouble(u'ERROR: unable to download video')
1422
1423
1424 class YahooIE(InfoExtractor):
1425         """Information extractor for video.yahoo.com."""
1426
1427         # _VALID_URL matches all Yahoo! Video URLs
1428         # _VPAGE_URL matches only the extractable '/watch/' URLs
1429         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1430         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1431
1432         def __init__(self, downloader=None):
1433                 InfoExtractor.__init__(self, downloader)
1434
1435         @staticmethod
1436         def suitable(url):
1437                 return (re.match(YahooIE._VALID_URL, url) is not None)
1438
1439         def report_download_webpage(self, video_id):
1440                 """Report webpage download."""
1441                 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1442
1443         def report_extraction(self, video_id):
1444                 """Report information extraction."""
1445                 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1446
1447         def _real_initialize(self):
1448                 return
1449
1450         def _real_extract(self, url, new_video=True):
1451                 # Extract ID from URL
1452                 mobj = re.match(self._VALID_URL, url)
1453                 if mobj is None:
1454                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1455                         return
1456
1457                 # At this point we have a new video
1458                 self._downloader.increment_downloads()
1459                 video_id = mobj.group(2)
1460                 video_extension = 'flv'
1461
1462                 # Rewrite valid but non-extractable URLs as
1463                 # extractable English language /watch/ URLs
1464                 if re.match(self._VPAGE_URL, url) is None:
1465                         request = urllib2.Request(url)
1466                         try:
1467                                 webpage = urllib2.urlopen(request).read()
1468                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1469                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1470                                 return
1471
1472                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1473                         if mobj is None:
1474                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1475                                 return
1476                         yahoo_id = mobj.group(1)
1477
1478                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1479                         if mobj is None:
1480                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1481                                 return
1482                         yahoo_vid = mobj.group(1)
1483
1484                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1485                         return self._real_extract(url, new_video=False)
1486
1487                 # Retrieve video webpage to extract further information
1488                 request = urllib2.Request(url)
1489                 try:
1490                         self.report_download_webpage(video_id)
1491                         webpage = urllib2.urlopen(request).read()
1492                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1493                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1494                         return
1495
1496                 # Extract uploader and title from webpage
1497                 self.report_extraction(video_id)
1498                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1499                 if mobj is None:
1500                         self._downloader.trouble(u'ERROR: unable to extract video title')
1501                         return
1502                 video_title = mobj.group(1).decode('utf-8')
1503                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1504
1505                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1506                 if mobj is None:
1507                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1508                         return
1509                 video_uploader = mobj.group(1).decode('utf-8')
1510
1511                 # Extract video thumbnail
1512                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1513                 if mobj is None:
1514                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1515                         return
1516                 video_thumbnail = mobj.group(1).decode('utf-8')
1517
1518                 # Extract video description
1519                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1520                 if mobj is None:
1521                         self._downloader.trouble(u'ERROR: unable to extract video description')
1522                         return
1523                 video_description = mobj.group(1).decode('utf-8')
1524                 if not video_description: video_description = 'No description available.'
1525
1526                 # Extract video height and width
1527                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1528                 if mobj is None:
1529                         self._downloader.trouble(u'ERROR: unable to extract video height')
1530                         return
1531                 yv_video_height = mobj.group(1)
1532
1533                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1534                 if mobj is None:
1535                         self._downloader.trouble(u'ERROR: unable to extract video width')
1536                         return
1537                 yv_video_width = mobj.group(1)
1538
1539                 # Retrieve video playlist to extract media URL
1540                 # I'm not completely sure what all these options are, but we
1541                 # seem to need most of them, otherwise the server sends a 401.
1542                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1543                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1544                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1545                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1546                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1547                 try:
1548                         self.report_download_webpage(video_id)
1549                         webpage = urllib2.urlopen(request).read()
1550                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1552                         return
1553
1554                 # Extract media URL from playlist XML
1555                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1556                 if mobj is None:
1557                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1558                         return
1559                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1560                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1561
1562                 try:
1563                         # Process video information
1564                         self._downloader.process_info({
1565                                 'id':           video_id.decode('utf-8'),
1566                                 'url':          video_url,
1567                                 'uploader':     video_uploader,
1568                                 'upload_date':  u'NA',
1569                                 'title':        video_title,
1570                                 'stitle':       simple_title,
1571                                 'ext':          video_extension.decode('utf-8'),
1572                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1573                                 'description':  video_description,
1574                                 'thumbnail':    video_thumbnail,
1575                                 'description':  video_description,
1576                                 'player_url':   None,
1577                         })
1578                 except UnavailableVideoError:
1579                         self._downloader.trouble(u'ERROR: unable to download video')
1580
1581
1582 class GenericIE(InfoExtractor):
1583         """Generic last-resort information extractor."""
1584
1585         def __init__(self, downloader=None):
1586                 InfoExtractor.__init__(self, downloader)
1587
1588         @staticmethod
1589         def suitable(url):
1590                 return True
1591
1592         def report_download_webpage(self, video_id):
1593                 """Report webpage download."""
1594                 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1595                 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1596
1597         def report_extraction(self, video_id):
1598                 """Report information extraction."""
1599                 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1600
1601         def _real_initialize(self):
1602                 return
1603
1604         def _real_extract(self, url):
1605                 # At this point we have a new video
1606                 self._downloader.increment_downloads()
1607
1608                 video_id = url.split('/')[-1]
1609                 request = urllib2.Request(url)
1610                 try:
1611                         self.report_download_webpage(video_id)
1612                         webpage = urllib2.urlopen(request).read()
1613                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1614                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1615                         return
1616                 except ValueError, err:
1617                         # since this is the last-resort InfoExtractor, if
1618                         # this error is thrown, it'll be thrown here
1619                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1620                         return
1621
1622                 self.report_extraction(video_id)
1623                 # Start with something easy: JW Player in SWFObject
1624                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1625                 if mobj is None:
1626                         # Broaden the search a little bit
1627                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1628                 if mobj is None:
1629                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1630                         return
1631
1632                 # It's possible that one of the regexes
1633                 # matched, but returned an empty group:
1634                 if mobj.group(1) is None:
1635                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1636                         return
1637
1638                 video_url = urllib.unquote(mobj.group(1))
1639                 video_id  = os.path.basename(video_url)
1640
1641                 # here's a fun little line of code for you:
1642                 video_extension = os.path.splitext(video_id)[1][1:]
1643                 video_id        = os.path.splitext(video_id)[0]
1644
1645                 # it's tempting to parse this further, but you would
1646                 # have to take into account all the variations like
1647                 #   Video Title - Site Name
1648                 #   Site Name | Video Title
1649                 #   Video Title - Tagline | Site Name
1650                 # and so on and so forth; it's just not practical
1651                 mobj = re.search(r'<title>(.*)</title>', webpage)
1652                 if mobj is None:
1653                         self._downloader.trouble(u'ERROR: unable to extract title')
1654                         return
1655                 video_title = mobj.group(1).decode('utf-8')
1656                 video_title = sanitize_title(video_title)
1657                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1658
1659                 # video uploader is domain name
1660                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1661                 if mobj is None:
1662                         self._downloader.trouble(u'ERROR: unable to extract title')
1663                         return
1664                 video_uploader = mobj.group(1).decode('utf-8')
1665
1666                 try:
1667                         # Process video information
1668                         self._downloader.process_info({
1669                                 'id':           video_id.decode('utf-8'),
1670                                 'url':          video_url.decode('utf-8'),
1671                                 'uploader':     video_uploader,
1672                                 'upload_date':  u'NA',
1673                                 'title':        video_title,
1674                                 'stitle':       simple_title,
1675                                 'ext':          video_extension.decode('utf-8'),
1676                                 'format':       u'NA',
1677                                 'player_url':   None,
1678                         })
1679                 except UnavailableVideoError, err:
1680                         self._downloader.trouble(u'ERROR: unable to download video')
1681
1682
1683 class YoutubeSearchIE(InfoExtractor):
1684         """Information Extractor for YouTube search queries."""
1685         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1686         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1687         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1688         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1689         _youtube_ie = None
1690         _max_youtube_results = 1000
1691
1692         def __init__(self, youtube_ie, downloader=None):
1693                 InfoExtractor.__init__(self, downloader)
1694                 self._youtube_ie = youtube_ie
1695         
1696         @staticmethod
1697         def suitable(url):
1698                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1699
1700         def report_download_page(self, query, pagenum):
1701                 """Report attempt to download playlist page with given number."""
1702                 query = query.decode(preferredencoding())
1703                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1704
1705         def _real_initialize(self):
1706                 self._youtube_ie.initialize()
1707         
1708         def _real_extract(self, query):
1709                 mobj = re.match(self._VALID_QUERY, query)
1710                 if mobj is None:
1711                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1712                         return
1713
1714                 prefix, query = query.split(':')
1715                 prefix = prefix[8:]
1716                 query  = query.encode('utf-8')
1717                 if prefix == '':
1718                         self._download_n_results(query, 1)
1719                         return
1720                 elif prefix == 'all':
1721                         self._download_n_results(query, self._max_youtube_results)
1722                         return
1723                 else:
1724                         try:
1725                                 n = long(prefix)
1726                                 if n <= 0:
1727                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1728                                         return
1729                                 elif n > self._max_youtube_results:
1730                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1731                                         n = self._max_youtube_results
1732                                 self._download_n_results(query, n)
1733                                 return
1734                         except ValueError: # parsing prefix as integer fails
1735                                 self._download_n_results(query, 1)
1736                                 return
1737
1738         def _download_n_results(self, query, n):
1739                 """Downloads a specified number of results for a query"""
1740
1741                 video_ids = []
1742                 already_seen = set()
1743                 pagenum = 1
1744
1745                 while True:
1746                         self.report_download_page(query, pagenum)
1747                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1748                         request = urllib2.Request(result_url, None, std_headers)
1749                         try:
1750                                 page = urllib2.urlopen(request).read()
1751                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1752                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1753                                 return
1754
1755                         # Extract video identifiers
1756                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1757                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1758                                 if video_id not in already_seen:
1759                                         video_ids.append(video_id)
1760                                         already_seen.add(video_id)
1761                                         if len(video_ids) == n:
1762                                                 # Specified n videos reached
1763                                                 for id in video_ids:
1764                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1765                                                 return
1766
1767                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1768                                 for id in video_ids:
1769                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1770                                 return
1771
1772                         pagenum = pagenum + 1
1773
1774 class GoogleSearchIE(InfoExtractor):
1775         """Information Extractor for Google Video search queries."""
1776         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1777         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1778         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1779         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1780         _google_ie = None
1781         _max_google_results = 1000
1782
1783         def __init__(self, google_ie, downloader=None):
1784                 InfoExtractor.__init__(self, downloader)
1785                 self._google_ie = google_ie
1786         
1787         @staticmethod
1788         def suitable(url):
1789                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1790
1791         def report_download_page(self, query, pagenum):
1792                 """Report attempt to download playlist page with given number."""
1793                 query = query.decode(preferredencoding())
1794                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1795
1796         def _real_initialize(self):
1797                 self._google_ie.initialize()
1798         
1799         def _real_extract(self, query):
1800                 mobj = re.match(self._VALID_QUERY, query)
1801                 if mobj is None:
1802                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1803                         return
1804
1805                 prefix, query = query.split(':')
1806                 prefix = prefix[8:]
1807                 query  = query.encode('utf-8')
1808                 if prefix == '':
1809                         self._download_n_results(query, 1)
1810                         return
1811                 elif prefix == 'all':
1812                         self._download_n_results(query, self._max_google_results)
1813                         return
1814                 else:
1815                         try:
1816                                 n = long(prefix)
1817                                 if n <= 0:
1818                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1819                                         return
1820                                 elif n > self._max_google_results:
1821                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1822                                         n = self._max_google_results
1823                                 self._download_n_results(query, n)
1824                                 return
1825                         except ValueError: # parsing prefix as integer fails
1826                                 self._download_n_results(query, 1)
1827                                 return
1828
1829         def _download_n_results(self, query, n):
1830                 """Downloads a specified number of results for a query"""
1831
1832                 video_ids = []
1833                 already_seen = set()
1834                 pagenum = 1
1835
1836                 while True:
1837                         self.report_download_page(query, pagenum)
1838                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1839                         request = urllib2.Request(result_url, None, std_headers)
1840                         try:
1841                                 page = urllib2.urlopen(request).read()
1842                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1843                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1844                                 return
1845
1846                         # Extract video identifiers
1847                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1848                                 video_id = mobj.group(1)
1849                                 if video_id not in already_seen:
1850                                         video_ids.append(video_id)
1851                                         already_seen.add(video_id)
1852                                         if len(video_ids) == n:
1853                                                 # Specified n videos reached
1854                                                 for id in video_ids:
1855                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1856                                                 return
1857
1858                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1859                                 for id in video_ids:
1860                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1861                                 return
1862
1863                         pagenum = pagenum + 1
1864
1865 class YahooSearchIE(InfoExtractor):
1866         """Information Extractor for Yahoo! Video search queries."""
1867         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1868         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1869         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1870         _MORE_PAGES_INDICATOR = r'\s*Next'
1871         _yahoo_ie = None
1872         _max_yahoo_results = 1000
1873
1874         def __init__(self, yahoo_ie, downloader=None):
1875                 InfoExtractor.__init__(self, downloader)
1876                 self._yahoo_ie = yahoo_ie
1877         
1878         @staticmethod
1879         def suitable(url):
1880                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1881
1882         def report_download_page(self, query, pagenum):
1883                 """Report attempt to download playlist page with given number."""
1884                 query = query.decode(preferredencoding())
1885                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1886
1887         def _real_initialize(self):
1888                 self._yahoo_ie.initialize()
1889         
1890         def _real_extract(self, query):
1891                 mobj = re.match(self._VALID_QUERY, query)
1892                 if mobj is None:
1893                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1894                         return
1895
1896                 prefix, query = query.split(':')
1897                 prefix = prefix[8:]
1898                 query  = query.encode('utf-8')
1899                 if prefix == '':
1900                         self._download_n_results(query, 1)
1901                         return
1902                 elif prefix == 'all':
1903                         self._download_n_results(query, self._max_yahoo_results)
1904                         return
1905                 else:
1906                         try:
1907                                 n = long(prefix)
1908                                 if n <= 0:
1909                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1910                                         return
1911                                 elif n > self._max_yahoo_results:
1912                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1913                                         n = self._max_yahoo_results
1914                                 self._download_n_results(query, n)
1915                                 return
1916                         except ValueError: # parsing prefix as integer fails
1917                                 self._download_n_results(query, 1)
1918                                 return
1919
1920         def _download_n_results(self, query, n):
1921                 """Downloads a specified number of results for a query"""
1922
1923                 video_ids = []
1924                 already_seen = set()
1925                 pagenum = 1
1926
1927                 while True:
1928                         self.report_download_page(query, pagenum)
1929                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1930                         request = urllib2.Request(result_url, None, std_headers)
1931                         try:
1932                                 page = urllib2.urlopen(request).read()
1933                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1934                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1935                                 return
1936
1937                         # Extract video identifiers
1938                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1939                                 video_id = mobj.group(1)
1940                                 if video_id not in already_seen:
1941                                         video_ids.append(video_id)
1942                                         already_seen.add(video_id)
1943                                         if len(video_ids) == n:
1944                                                 # Specified n videos reached
1945                                                 for id in video_ids:
1946                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1947                                                 return
1948
1949                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1950                                 for id in video_ids:
1951                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1952                                 return
1953
1954                         pagenum = pagenum + 1
1955
1956 class YoutubePlaylistIE(InfoExtractor):
1957         """Information Extractor for YouTube playlists."""
1958
1959         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1960         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1961         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1962         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1963         _youtube_ie = None
1964
1965         def __init__(self, youtube_ie, downloader=None):
1966                 InfoExtractor.__init__(self, downloader)
1967                 self._youtube_ie = youtube_ie
1968         
1969         @staticmethod
1970         def suitable(url):
1971                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1972
1973         def report_download_page(self, playlist_id, pagenum):
1974                 """Report attempt to download playlist page with given number."""
1975                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1976
1977         def _real_initialize(self):
1978                 self._youtube_ie.initialize()
1979         
1980         def _real_extract(self, url):
1981                 # Extract playlist id
1982                 mobj = re.match(self._VALID_URL, url)
1983                 if mobj is None:
1984                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1985                         return
1986
1987                 # Download playlist pages
1988                 playlist_id = mobj.group(1)
1989                 video_ids = []
1990                 pagenum = 1
1991
1992                 while True:
1993                         self.report_download_page(playlist_id, pagenum)
1994                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1995                         try:
1996                                 page = urllib2.urlopen(request).read()
1997                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1998                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1999                                 return
2000
2001                         # Extract video identifiers
2002                         ids_in_page = []
2003                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2004                                 if mobj.group(1) not in ids_in_page:
2005                                         ids_in_page.append(mobj.group(1))
2006                         video_ids.extend(ids_in_page)
2007
2008                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2009                                 break
2010                         pagenum = pagenum + 1
2011
2012                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2013                 playlistend = self._downloader.params.get('playlistend', -1)
2014                 video_ids = video_ids[playliststart:playlistend]
2015
2016                 for id in video_ids:
2017                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2018                 return
2019
2020 class YoutubeUserIE(InfoExtractor):
2021         """Information Extractor for YouTube users."""
2022
2023         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2024         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2025         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2026         _youtube_ie = None
2027
2028         def __init__(self, youtube_ie, downloader=None):
2029                 InfoExtractor.__init__(self, downloader)
2030                 self._youtube_ie = youtube_ie
2031         
2032         @staticmethod
2033         def suitable(url):
2034                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2035
2036         def report_download_page(self, username):
2037                 """Report attempt to download user page."""
2038                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2039
2040         def _real_initialize(self):
2041                 self._youtube_ie.initialize()
2042         
2043         def _real_extract(self, url):
2044                 # Extract username
2045                 mobj = re.match(self._VALID_URL, url)
2046                 if mobj is None:
2047                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2048                         return
2049
2050                 # Download user page
2051                 username = mobj.group(1)
2052                 video_ids = []
2053                 pagenum = 1
2054
2055                 self.report_download_page(username)
2056                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2057                 try:
2058                         page = urllib2.urlopen(request).read()
2059                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2060                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2061                         return
2062
2063                 # Extract video identifiers
2064                 ids_in_page = []
2065
2066                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2067                         if mobj.group(1) not in ids_in_page:
2068                                 ids_in_page.append(mobj.group(1))
2069                 video_ids.extend(ids_in_page)
2070
2071                 playliststart = self._downloader.params.get('playliststart', 1) - 1
2072                 playlistend = self._downloader.params.get('playlistend', -1)
2073                 video_ids = video_ids[playliststart:playlistend]
2074
2075                 for id in video_ids:
2076                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2077                 return
2078
2079 class DepositFilesIE(InfoExtractor):
2080         """Information extractor for depositfiles.com"""
2081
2082         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2083
2084         def __init__(self, downloader=None):
2085                 InfoExtractor.__init__(self, downloader)
2086
2087         @staticmethod
2088         def suitable(url):
2089                 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2090
2091         def report_download_webpage(self, file_id):
2092                 """Report webpage download."""
2093                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2094
2095         def report_extraction(self, file_id):
2096                 """Report information extraction."""
2097                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2098
2099         def _real_initialize(self):
2100                 return
2101
2102         def _real_extract(self, url):
2103                 # At this point we have a new file
2104                 self._downloader.increment_downloads()
2105
2106                 file_id = url.split('/')[-1]
2107                 # Rebuild url in english locale
2108                 url = 'http://depositfiles.com/en/files/' + file_id
2109
2110                 # Retrieve file webpage with 'Free download' button pressed
2111                 free_download_indication = { 'gateway_result' : '1' }
2112                 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2113                 try:
2114                         self.report_download_webpage(file_id)
2115                         webpage = urllib2.urlopen(request).read()
2116                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2117                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2118                         return
2119
2120                 # Search for the real file URL
2121                 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2122                 if (mobj is None) or (mobj.group(1) is None):
2123                         # Try to figure out reason of the error.
2124                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2125                         if (mobj is not None) and (mobj.group(1) is not None):
2126                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2127                                 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2128                         else:
2129                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2130                         return
2131
2132                 file_url = mobj.group(1)
2133                 file_extension = os.path.splitext(file_url)[1][1:]
2134
2135                 # Search for file title
2136                 mobj = re.search(r'<b title="(.*?)">', webpage)
2137                 if mobj is None:
2138                         self._downloader.trouble(u'ERROR: unable to extract title')
2139                         return
2140                 file_title = mobj.group(1).decode('utf-8')
2141
2142                 try:
2143                         # Process file information
2144                         self._downloader.process_info({
2145                                 'id':           file_id.decode('utf-8'),
2146                                 'url':          file_url.decode('utf-8'),
2147                                 'uploader':     u'NA',
2148                                 'upload_date':  u'NA',
2149                                 'title':        file_title,
2150                                 'stitle':       file_title,
2151                                 'ext':          file_extension.decode('utf-8'),
2152                                 'format':       u'NA',
2153                                 'player_url':   None,
2154                         })
2155                 except UnavailableVideoError, err:
2156                         self._downloader.trouble(u'ERROR: unable to download file')
2157
class PostProcessor(object):
        """Base class for post-processing steps.

        Instances are registered on a downloader via its
        add_post_processor() method. After each successful download the
        downloader walks its chain of PostProcessors, feeding the first
        one an initial information dictionary and each subsequent one the
        value returned by its predecessor.

        The chain stops as soon as a PostProcessor returns None, or when
        its end is reached.

        Like InfoExtractor objects, PostProcessors take part in a
        "mutual registration" scheme with their downloader.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Sets the downloader for this PP."""
                self._downloader = downloader

        def run(self, information):
                """Run the PostProcessor.

                "information" is a dictionary shaped like the ones built
                by InfoExtractors, plus one extra key, "filepath", naming
                the file that was downloaded.

                Returning None halts the postprocessing chain. Returning
                an information dictionary (possibly the received one with
                some fields altered) passes it on to the next
                PostProcessor in the chain.

                This method may also raise a PostProcessingError, which
                the calling downloader will handle.
                """
                return information # default implementation: pass through
2203         
2204 ### MAIN PROGRAM ###
2205 if __name__ == '__main__':
2206         try:
2207                 # Modules needed only when running the main program
2208                 import getpass
2209                 import optparse
2210
2211                 # Function to update the program file with the latest version from bitbucket.org
2212                 def update_self(downloader, filename):
2213                         # Note: downloader only used for options
2214                         if not os.access (filename, os.W_OK):
2215                                 sys.exit('ERROR: no write permissions on %s' % filename)
2216
2217                         downloader.to_screen('Updating to latest stable version...')
2218                         latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2219                         latest_version = urllib.urlopen(latest_url).read().strip()
2220                         prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2221                         newcontent = urllib.urlopen(prog_url).read()
2222                         stream = open(filename, 'w')
2223                         stream.write(newcontent)
2224                         stream.close()
2225                         downloader.to_screen('Updated to version %s' % latest_version)
2226
2227                 # Parse command line
2228                 parser = optparse.OptionParser(
2229                         usage='Usage: %prog [options] url...',
2230                         version='2010.11.19',
2231                         conflict_handler='resolve',
2232                 )
2233
2234                 parser.add_option('-h', '--help',
2235                                 action='help', help='print this help text and exit')
2236                 parser.add_option('-v', '--version',
2237                                 action='version', help='print program version and exit')
2238                 parser.add_option('-U', '--update',
2239                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2240                 parser.add_option('-i', '--ignore-errors',
2241                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2242                 parser.add_option('-r', '--rate-limit',
2243                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2244                 parser.add_option('-R', '--retries',
2245                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2246                 parser.add_option('--playlist-start',
2247                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2248                 parser.add_option('--playlist-end',
2249                                 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2250
2251                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2252                 authentication.add_option('-u', '--username',
2253                                 dest='username', metavar='USERNAME', help='account username')
2254                 authentication.add_option('-p', '--password',
2255                                 dest='password', metavar='PASSWORD', help='account password')
2256                 authentication.add_option('-n', '--netrc',
2257                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2258                 parser.add_option_group(authentication)
2259
2260                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2261                 video_format.add_option('-f', '--format',
2262                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2263                 video_format.add_option('-m', '--mobile-version',
2264                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2265                 video_format.add_option('--all-formats',
2266                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2267                 video_format.add_option('--max-quality',
2268                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2269                 video_format.add_option('-b', '--best-quality',
2270                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2271                 parser.add_option_group(video_format)
2272
2273                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2274                 verbosity.add_option('-q', '--quiet',
2275                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2276                 verbosity.add_option('-s', '--simulate',
2277                                 action='store_true', dest='simulate', help='do not download video', default=False)
2278                 verbosity.add_option('-g', '--get-url',
2279                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2280                 verbosity.add_option('-e', '--get-title',
2281                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2282                 verbosity.add_option('--get-thumbnail',
2283                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2284                 verbosity.add_option('--get-description',
2285                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2286                 verbosity.add_option('--no-progress',
2287                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2288                 parser.add_option_group(verbosity)
2289
2290                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2291                 filesystem.add_option('-t', '--title',
2292                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2293                 filesystem.add_option('-l', '--literal',
2294                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2295                 filesystem.add_option('-A', '--auto-number',
2296                                 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2297                 filesystem.add_option('-o', '--output',
2298                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2299                 filesystem.add_option('-a', '--batch-file',
2300                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2301                 filesystem.add_option('-w', '--no-overwrites',
2302                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2303                 filesystem.add_option('-c', '--continue',
2304                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2305                 filesystem.add_option('--cookies',
2306                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2307                 parser.add_option_group(filesystem)
2308
2309                 (opts, args) = parser.parse_args()
2310
2311                 # Open appropriate CookieJar
2312                 if opts.cookiefile is None:
2313                         jar = cookielib.CookieJar()
2314                 else:
2315                         try:
2316                                 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2317                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2318                                         jar.load()
2319                         except (IOError, OSError), err:
2320                                 sys.exit(u'ERROR: unable to open cookie file')
2321
2322                 # General configuration
2323                 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2324                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2325                 urllib2.install_opener(urllib2.build_opener(cookie_processor))
2326                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2327
2328                 # Batch file verification
2329                 batchurls = []
2330                 if opts.batchfile is not None:
2331                         try:
2332                                 if opts.batchfile == '-':
2333                                         batchfd = sys.stdin
2334                                 else:
2335                                         batchfd = open(opts.batchfile, 'r')
2336                                 batchurls = batchfd.readlines()
2337                                 batchurls = [x.strip() for x in batchurls]
2338                                 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2339                         except IOError:
2340                                 sys.exit(u'ERROR: batch file could not be read')
2341                 all_urls = batchurls + args
2342
                     # Sanity-check the parsed command-line options.  parser.error()
                     # prints the message and terminates the process, so each failed
                     # check below aborts the run immediately.
2343                 # Conflicting, missing and erroneous options
2344                 if opts.bestquality:
2345                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2346                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2347                         parser.error(u'using .netrc conflicts with giving username/password')
2348                 if opts.password is not None and opts.username is None:
2349                         parser.error(u'account username missing')
2350                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2351                         parser.error(u'using output template conflicts with using title, literal title or auto number')
2352                 if opts.usetitle and opts.useliteral:
2353                         parser.error(u'using title conflicts with using literal title')
                     # Username without password: prompt interactively.
                     # NOTE(review): 'getpass' does not appear in this file's import
                     # list at the top of the file, so this line would raise
                     # NameError when reached — confirm 'import getpass' exists
                     # elsewhere, or add it.
2354                 if opts.username is not None and opts.password is None:
2355                         opts.password = getpass.getpass(u'Type account password and press return:')
                     # parse_bytes() returns None on unparsable input (e.g. '50K' ok,
                     # garbage not) — treat None as a usage error.
2356                 if opts.ratelimit is not None:
2357                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2358                         if numeric_limit is None:
2359                                 parser.error(u'invalid rate limit specified')
2360                         opts.ratelimit = numeric_limit
                     # Python 2 'except X, err' syntax throughout; 'long' is the
                     # Python 2 arbitrary-precision integer constructor.
2361                 if opts.retries is not None:
2362                         try:
2363                                 opts.retries = long(opts.retries)
2364                         except (TypeError, ValueError), err:
2365                                 parser.error(u'invalid retry count specified')
                     # Playlist start must be a positive integer (1-based).
2366                 try:
2367                         opts.playliststart = long(opts.playliststart)
2368                         if opts.playliststart <= 0:
2369                                 raise ValueError
2370                 except (TypeError, ValueError), err:
2371                         parser.error(u'invalid playlist start number specified')
                     # Playlist end: -1 is accepted as a special value; any other
                     # value must be positive and not precede the start index.
2372                 try:
2373                         opts.playlistend = long(opts.playlistend)
2374                         if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2375                                 raise ValueError
2376                 except (TypeError, ValueError), err:
2377                         parser.error(u'invalid playlist end number specified')
2378
                     # Instantiate one object per supported site.  Several
                     # extractors are constructed with a reference to another
                     # extractor (e.g. the playlist/user/search ones wrap the
                     # plain YouTube extractor) so they can delegate per-video
                     # work to it.
2379                 # Information extractors
2380                 youtube_ie = YoutubeIE()
2381                 metacafe_ie = MetacafeIE(youtube_ie)
2382                 dailymotion_ie = DailymotionIE()
2383                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2384                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2385                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2386                 google_ie = GoogleIE()
2387                 google_search_ie = GoogleSearchIE(google_ie)
2388                 photobucket_ie = PhotobucketIE()
2389                 yahoo_ie = YahooIE()
2390                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2391                 deposit_files_ie = DepositFilesIE()
2392                 generic_ie = GenericIE()
2393
                     # Build the FileDownloader with a plain dict of settings.
                     # Any of the --get-* print options implies both 'quiet' and
                     # 'simulate' so nothing is downloaded while the requested
                     # field is printed.
2394                 # File downloader
2395                 fd = FileDownloader({
2396                         'usenetrc': opts.usenetrc,
2397                         'username': opts.username,
2398                         'password': opts.password,
2399                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2400                         'forceurl': opts.geturl,
2401                         'forcetitle': opts.gettitle,
2402                         'forcethumbnail': opts.getthumbnail,
2403                         'forcedescription': opts.getdescription,
2404                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2405                         'format': opts.format,
2406                         'format_limit': opts.format_limit,
                             # Output template selection: this 'and'/'or' chain picks
                             # the first truthy alternative.  An explicit -o template
                             # wins (decoded with the system's preferred encoding);
                             # otherwise a default is chosen from the format/title/
                             # literal-title/autonumber flags, falling back to
                             # '%(id)s.%(ext)s'.
2407                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2408                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2409                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2410                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2411                                 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2412                                 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2413                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2414                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2415                                 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2416                                 or u'%(id)s.%(ext)s'),
2417                         'ignoreerrors': opts.ignoreerrors,
2418                         'ratelimit': opts.ratelimit,
2419                         'nooverwrites': opts.nooverwrites,
2420                         'retries': opts.retries,
2421                         'continuedl': opts.continue_dl,
2422                         'noprogress': opts.noprogress,
2423                         'playliststart': opts.playliststart,
2424                         'playlistend': opts.playlistend,
                             # When the output template is '-', progress/messages are
                             # routed to stderr — presumably because stdout carries
                             # the downloaded data; verify against FileDownloader.
2425                         'logtostderr': opts.outtmpl == '-',
2426                         })
                     # Register the extractors with the downloader.  Registration
                     # order matters: more specific matchers (search, playlist,
                     # user) come before the plain site extractors, and the
                     # catch-all GenericIE is added last as the fallback.
2427                 fd.add_info_extractor(youtube_search_ie)
2428                 fd.add_info_extractor(youtube_pl_ie)
2429                 fd.add_info_extractor(youtube_user_ie)
2430                 fd.add_info_extractor(metacafe_ie)
2431                 fd.add_info_extractor(dailymotion_ie)
2432                 fd.add_info_extractor(youtube_ie)
2433                 fd.add_info_extractor(google_ie)
2434                 fd.add_info_extractor(google_search_ie)
2435                 fd.add_info_extractor(photobucket_ie)
2436                 fd.add_info_extractor(yahoo_ie)
2437                 fd.add_info_extractor(yahoo_search_ie)
2438                 fd.add_info_extractor(deposit_files_ie)
2439
2440                 # This must come last since it's the
2441                 # fallback if none of the others work
2442                 fd.add_info_extractor(generic_ie)
2443
                     # Self-update when requested (replaces the script at
                     # sys.argv[0]).
2444                 # Update version
2445                 if opts.update_self:
2446                         update_self(fd, sys.argv[0])
2447
                     # With no URLs at all, --update-self alone is a valid
                     # invocation (exit 0); otherwise it is a usage error.
2448                 # Maybe do nothing
2449                 if len(all_urls) < 1:
2450                         if not opts.update_self:
2451                                 parser.error(u'you must provide at least one URL')
2452                         else:
2453                                 sys.exit()
                     # Run the downloads; the return code becomes the process
                     # exit status below.
2454                 retcode = fd.download(all_urls)
2455
                     # Persist cookies back to --cookies file, if one was given.
2456                 # Dump cookie jar if requested
2457                 if opts.cookiefile is not None:
2458                         try:
2459                                 jar.save()
2460                         except (IOError, OSError), err:
2461                                 sys.exit(u'ERROR: unable to save cookie jar')
2462
2463                 sys.exit(retcode)
2464
             # Handlers for the enclosing try: (opened before this excerpt).
             # sys.exit(<unicode message>) prints the message to stderr and
             # exits with status 1; sys.exit(1) exits silently with status 1.
2465         except DownloadError:
2466                 sys.exit(1)
2467         except SameFileError:
2468                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2469         except KeyboardInterrupt:
2470                 sys.exit(u'\nERROR: Interrupted by user')