717d97d9673efc775bfdab9e873ec30bcaec0f98
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Headers sent with every HTTP request, imitating a regular Firefox
# browser so sites serve the same pages they would serve to a real user.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Unicode string of characters kept by "simplified" titles: ASCII letters and digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: falls back to
	UTF-8 when the locale reports no encoding, or one that cannot
	actually encode text.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check the reported codec; some platforms report
		# values that unicode.encode() rejects.
		u'TEST'.encode(pref)
	except:
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Error raised when a download fails.

	FileDownloader objects throw this exception when downloading goes
	wrong and they have not been configured to ignore errors. It carries
	the relevant error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Error raised when several downloads would collide on disk.

	FileDownloader objects throw this exception when they detect that
	multiple files would have to be written to the same file name.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Error raised by a failing post-processing step.

	A PostProcessor's .run() method may raise this exception to signal
	that the post-processing task went wrong.
	"""
	pass
133
class UnavailableVideoError(Exception):
	"""Error raised when a requested format does not exist.

	Thrown when a video is requested in a format that is not available
	for that particular video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Error raised when a download delivers fewer bytes than announced.

	FileDownloader objects raise this when the file they fetched is
	smaller than the length the server advertised, which usually means
	the connection was interrupted.
	"""
	# Both counters are in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible for downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, a task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader hands it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	format_limit:   Highest quality format to try.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	retries:        Number of times to retry for HTTP error 503
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	params = None			# Option dictionary; see class docstring
	_ies = []			# Registered InfoExtractors, tried in order
	_pps = []			# Registered PostProcessors, run as a chain
	_download_retcode = None	# Process return code: 0 ok, 1 after trouble()
	_num_downloads = None		# Ordinal of the current download ('ord' template key)

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build every directory prefix of the path; the last component is
		# the file name itself and is therefore excluded from the range.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '5.50M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Largest power of 1024 that fits in the value; indexes the
			# unit table below (0 = bytes, 1 = kilo, 2 = mega, ...).
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate the remaining download time as an 'MM:SS' string."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		# Extrapolate from the average rate observed so far
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed so far as a right-aligned string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Pick the next read size so each read takes roughly one second."""
		# Clamp changes to at most halving/doubling the previous block
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# The suffix's position in the unit table is the power of 1024;
		# an empty suffix is found at index 0, i.e. plain bytes.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; one is
				# appended manually unless skip_eol is set (False indexes
				# u'\n', True indexes u'').
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to drop back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress; \\r redraws the same console line."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The file name cannot be printed in this console's encoding
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# Terminate the progress line started by report_progress
			self.to_stdout(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		# Expand the output template; 'epoch' and 'ord' are synthetic
		# fields available in addition to the extractor-provided ones.
		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed template can only hold one file; refuse multiple URLs
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file.

		A post processor returning None stops the chain.
		"""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			# Stop retrying once a resume attempt makes no progress
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (or delegate rtmp URLs).

		Returns True on success, False after reported trouble. May raise
		ContentTooShortError when fewer bytes arrive than announced.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request stays range-free; it is used to probe the full
		# length when a resume request is rejected with 416 below.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code != 503 and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code != 503:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so an unreachable video never
			# leaves an empty file behind
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is still the header string here, hence the str() on
		# the counter for the comparison
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
614
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor produces one dictionary per
	video the URL refers to; each dictionary is handed to the
	FileDownloader, which processes it (possibly downloading the video
	to the file system, among other outcomes). Every dictionary must
	carry the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to let
	youtube-dl serve as the backend of a video search function, such as
	the one in youtube2mp3; they are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should redefine _real_initialize(), _real_extract() and
	the suitable() static method, and are normally instantiated and
	added to the main downloader.
	"""

	# True once _real_initialize() has run for this instance.
	_ready = False
	# FileDownloader this extractor reports to (may be None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def initialize(self):
		"""Initializes an instance (authentication, etc), at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
685
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches an optional URL prefix (scheme, host and path up to
	# the id); group 2 captures the video identifier. The conditional
	# "(?(1).+)?" permits trailing characters only when a prefix matched.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL forces the site language to English, so the
	# regular expressions below match regardless of the user's locale.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc file for credentials.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to its file extension; codes not listed here
	# fall back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language and, when credentials are available,
		log in and confirm age.

		Failures are reported through the downloader as warnings or
		errors; nothing is raised to the caller.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					# netrc triple is (login, account, password)
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response,
			# authentication failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a YouTube URL and hand one
		info dictionary per selected format to the downloader."""
		# Extract video id from URL (group 2 of _VALID_URL)
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info. Try several 'el' parameter values in turn and
		# keep the first response that contains a 'token'; the empty
		# string (plain get_video_info) is the final fallback.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse runs of characters outside
		# simple_title_chars into single underscores, then strip them
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description (only extracted when its forced printing is enabled)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download; the template's trailing
		# %%s leaves a %s placeholder for the format code.
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			# A format limit caps quality at that format's position in
			# the quality-ordered _available_formats list.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP streams carry the URL in 'conn' and have no format code
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
949
950
951 class MetacafeIE(InfoExtractor):
952         """Information Extractor for metacafe.com."""
953
954         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
955         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
956         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
957         _youtube_ie = None
958
959         def __init__(self, youtube_ie, downloader=None):
960                 InfoExtractor.__init__(self, downloader)
961                 self._youtube_ie = youtube_ie
962
963         @staticmethod
964         def suitable(url):
965                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
966
967         def report_disclaimer(self):
968                 """Report disclaimer retrieval."""
969                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
970
971         def report_age_confirmation(self):
972                 """Report attempt to confirm age."""
973                 self._downloader.to_stdout(u'[metacafe] Confirming age')
974         
975         def report_download_webpage(self, video_id):
976                 """Report webpage download."""
977                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
978         
979         def report_extraction(self, video_id):
980                 """Report information extraction."""
981                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
982
983         def _real_initialize(self):
984                 # Retrieve disclaimer
985                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
986                 try:
987                         self.report_disclaimer()
988                         disclaimer = urllib2.urlopen(request).read()
989                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
990                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
991                         return
992
993                 # Confirm age
994                 disclaimer_form = {
995                         'filters': '0',
996                         'submit': "Continue - I'm over 18",
997                         }
998                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
999                 try:
1000                         self.report_age_confirmation()
1001                         disclaimer = urllib2.urlopen(request).read()
1002                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1003                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1004                         return
1005         
1006         def _real_extract(self, url):
1007                 # Extract id and simplified title from URL
1008                 mobj = re.match(self._VALID_URL, url)
1009                 if mobj is None:
1010                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1011                         return
1012
1013                 video_id = mobj.group(1)
1014
1015                 # Check if video comes from YouTube
1016                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1017                 if mobj2 is not None:
1018                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1019                         return
1020
1021                 # At this point we have a new video
1022                 self._downloader.increment_downloads()
1023
1024                 simple_title = mobj.group(2).decode('utf-8')
1025                 video_extension = 'flv'
1026
1027                 # Retrieve video webpage to extract further information
1028                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1029                 try:
1030                         self.report_download_webpage(video_id)
1031                         webpage = urllib2.urlopen(request).read()
1032                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1033                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1034                         return
1035
1036                 # Extract URL, uploader and title from webpage
1037                 self.report_extraction(video_id)
1038                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1039                 if mobj is None:
1040                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1041                         return
1042                 mediaURL = urllib.unquote(mobj.group(1))
1043
1044                 # Extract gdaKey if available
1045                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1046                 if mobj is None:
1047                         video_url = mediaURL
1048                         #self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1049                         #return
1050                 else:
1051                         gdaKey = mobj.group(1)
1052                         video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1053
1054                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1055                 if mobj is None:
1056                         self._downloader.trouble(u'ERROR: unable to extract title')
1057                         return
1058                 video_title = mobj.group(1).decode('utf-8')
1059                 video_title = sanitize_title(video_title)
1060
1061                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1062                 if mobj is None:
1063                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1064                         return
1065                 video_uploader = mobj.group(1)
1066
1067                 try:
1068                         # Process video information
1069                         self._downloader.process_info({
1070                                 'id':           video_id.decode('utf-8'),
1071                                 'url':          video_url.decode('utf-8'),
1072                                 'uploader':     video_uploader.decode('utf-8'),
1073                                 'title':        video_title,
1074                                 'stitle':       simple_title,
1075                                 'ext':          video_extension.decode('utf-8'),
1076                                 'format':       u'NA',
1077                                 'player_url':   None,
1078                         })
1079                 except UnavailableVideoError:
1080                         self._downloader.trouble(u'ERROR: unable to download video')
1081
1082
1083 class DailymotionIE(InfoExtractor):
1084         """Information Extractor for Dailymotion"""
1085
1086         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1087
1088         def __init__(self, downloader=None):
1089                 InfoExtractor.__init__(self, downloader)
1090
1091         @staticmethod
1092         def suitable(url):
1093                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1094
1095         def report_download_webpage(self, video_id):
1096                 """Report webpage download."""
1097                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1098         
1099         def report_extraction(self, video_id):
1100                 """Report information extraction."""
1101                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1102
1103         def _real_initialize(self):
1104                 return
1105
1106         def _real_extract(self, url):
1107                 # Extract id and simplified title from URL
1108                 mobj = re.match(self._VALID_URL, url)
1109                 if mobj is None:
1110                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1111                         return
1112
1113                 # At this point we have a new video
1114                 self._downloader.increment_downloads()
1115                 video_id = mobj.group(1)
1116
1117                 simple_title = mobj.group(2).decode('utf-8')
1118                 video_extension = 'flv'
1119
1120                 # Retrieve video webpage to extract further information
1121                 request = urllib2.Request(url)
1122                 try:
1123                         self.report_download_webpage(video_id)
1124                         webpage = urllib2.urlopen(request).read()
1125                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1126                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1127                         return
1128
1129                 # Extract URL, uploader and title from webpage
1130                 self.report_extraction(video_id)
1131                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1132                 if mobj is None:
1133                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1134                         return
1135                 mediaURL = urllib.unquote(mobj.group(1))
1136
1137                 # if needed add http://www.dailymotion.com/ if relative URL
1138
1139                 video_url = mediaURL
1140
1141                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1142                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1143                 if mobj is None:
1144                         self._downloader.trouble(u'ERROR: unable to extract title')
1145                         return
1146                 video_title = mobj.group(1).decode('utf-8')
1147                 video_title = sanitize_title(video_title)
1148
1149                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1150                 if mobj is None:
1151                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1152                         return
1153                 video_uploader = mobj.group(1)
1154
1155                 try:
1156                         # Process video information
1157                         self._downloader.process_info({
1158                                 'id':           video_id.decode('utf-8'),
1159                                 'url':          video_url.decode('utf-8'),
1160                                 'uploader':     video_uploader.decode('utf-8'),
1161                                 'title':        video_title,
1162                                 'stitle':       simple_title,
1163                                 'ext':          video_extension.decode('utf-8'),
1164                                 'format':       u'NA',
1165                                 'player_url':   None,
1166                         })
1167                 except UnavailableVideoError:
1168                         self._downloader.trouble(u'ERROR: unable to download video')
1169
1170 class GoogleIE(InfoExtractor):
1171         """Information extractor for video.google.com."""
1172
1173         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1174
1175         def __init__(self, downloader=None):
1176                 InfoExtractor.__init__(self, downloader)
1177
1178         @staticmethod
1179         def suitable(url):
1180                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1181
1182         def report_download_webpage(self, video_id):
1183                 """Report webpage download."""
1184                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1185
1186         def report_extraction(self, video_id):
1187                 """Report information extraction."""
1188                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1189
1190         def _real_initialize(self):
1191                 return
1192
1193         def _real_extract(self, url):
1194                 # Extract id from URL
1195                 mobj = re.match(self._VALID_URL, url)
1196                 if mobj is None:
1197                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1198                         return
1199
1200                 # At this point we have a new video
1201                 self._downloader.increment_downloads()
1202                 video_id = mobj.group(1)
1203
1204                 video_extension = 'mp4'
1205
1206                 # Retrieve video webpage to extract further information
1207                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1208                 try:
1209                         self.report_download_webpage(video_id)
1210                         webpage = urllib2.urlopen(request).read()
1211                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1212                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1213                         return
1214
1215                 # Extract URL, uploader, and title from webpage
1216                 self.report_extraction(video_id)
1217                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1218                 if mobj is None:
1219                         video_extension = 'flv'
1220                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1221                 if mobj is None:
1222                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1223                         return
1224                 mediaURL = urllib.unquote(mobj.group(1))
1225                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1226                 mediaURL = mediaURL.replace('\\x26', '\x26')
1227
1228                 video_url = mediaURL
1229
1230                 mobj = re.search(r'<title>(.*)</title>', webpage)
1231                 if mobj is None:
1232                         self._downloader.trouble(u'ERROR: unable to extract title')
1233                         return
1234                 video_title = mobj.group(1).decode('utf-8')
1235                 video_title = sanitize_title(video_title)
1236                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1237
1238                 # Extract video description
1239                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1240                 if mobj is None:
1241                         self._downloader.trouble(u'ERROR: unable to extract video description')
1242                         return
1243                 video_description = mobj.group(1).decode('utf-8')
1244                 if not video_description:
1245                         video_description = 'No description available.'
1246
1247                 # Extract video thumbnail
1248                 if self._downloader.params.get('forcethumbnail', False):
1249                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1250                         try:
1251                                 webpage = urllib2.urlopen(request).read()
1252                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1253                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1254                                 return
1255                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1256                         if mobj is None:
1257                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1258                                 return
1259                         video_thumbnail = mobj.group(1)
1260                 else:   # we need something to pass to process_info
1261                         video_thumbnail = ''
1262
1263
1264                 try:
1265                         # Process video information
1266                         self._downloader.process_info({
1267                                 'id':           video_id.decode('utf-8'),
1268                                 'url':          video_url.decode('utf-8'),
1269                                 'uploader':     u'NA',
1270                                 'title':        video_title,
1271                                 'stitle':       simple_title,
1272                                 'ext':          video_extension.decode('utf-8'),
1273                                 'format':       u'NA',
1274                                 'player_url':   None,
1275                         })
1276                 except UnavailableVideoError:
1277                         self._downloader.trouble(u'ERROR: unable to download video')
1278
1279
1280 class PhotobucketIE(InfoExtractor):
1281         """Information extractor for photobucket.com."""
1282
1283         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1284
1285         def __init__(self, downloader=None):
1286                 InfoExtractor.__init__(self, downloader)
1287
1288         @staticmethod
1289         def suitable(url):
1290                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1291
1292         def report_download_webpage(self, video_id):
1293                 """Report webpage download."""
1294                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1295
1296         def report_extraction(self, video_id):
1297                 """Report information extraction."""
1298                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1299
1300         def _real_initialize(self):
1301                 return
1302
1303         def _real_extract(self, url):
1304                 # Extract id from URL
1305                 mobj = re.match(self._VALID_URL, url)
1306                 if mobj is None:
1307                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1308                         return
1309
1310                 # At this point we have a new video
1311                 self._downloader.increment_downloads()
1312                 video_id = mobj.group(1)
1313
1314                 video_extension = 'flv'
1315
1316                 # Retrieve video webpage to extract further information
1317                 request = urllib2.Request(url)
1318                 try:
1319                         self.report_download_webpage(video_id)
1320                         webpage = urllib2.urlopen(request).read()
1321                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1323                         return
1324
1325                 # Extract URL, uploader, and title from webpage
1326                 self.report_extraction(video_id)
1327                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1328                 if mobj is None:
1329                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1330                         return
1331                 mediaURL = urllib.unquote(mobj.group(1))
1332
1333                 video_url = mediaURL
1334
1335                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1336                 if mobj is None:
1337                         self._downloader.trouble(u'ERROR: unable to extract title')
1338                         return
1339                 video_title = mobj.group(1).decode('utf-8')
1340                 video_title = sanitize_title(video_title)
1341                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1342
1343                 video_uploader = mobj.group(2).decode('utf-8')
1344
1345                 try:
1346                         # Process video information
1347                         self._downloader.process_info({
1348                                 'id':           video_id.decode('utf-8'),
1349                                 'url':          video_url.decode('utf-8'),
1350                                 'uploader':     video_uploader,
1351                                 'title':        video_title,
1352                                 'stitle':       simple_title,
1353                                 'ext':          video_extension.decode('utf-8'),
1354                                 'format':       u'NA',
1355                                 'player_url':   None,
1356                         })
1357                 except UnavailableVideoError:
1358                         self._downloader.trouble(u'ERROR: unable to download video')
1359
1360
1361 class YahooIE(InfoExtractor):
1362         """Information extractor for video.yahoo.com."""
1363
1364         # _VALID_URL matches all Yahoo! Video URLs
1365         # _VPAGE_URL matches only the extractable '/watch/' URLs
1366         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1367         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1368
1369         def __init__(self, downloader=None):
1370                 InfoExtractor.__init__(self, downloader)
1371
1372         @staticmethod
1373         def suitable(url):
1374                 return (re.match(YahooIE._VALID_URL, url) is not None)
1375
1376         def report_download_webpage(self, video_id):
1377                 """Report webpage download."""
1378                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1379
1380         def report_extraction(self, video_id):
1381                 """Report information extraction."""
1382                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1383
1384         def _real_initialize(self):
1385                 return
1386
1387         def _real_extract(self, url, new_video=True):
1388                 # Extract ID from URL
1389                 mobj = re.match(self._VALID_URL, url)
1390                 if mobj is None:
1391                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1392                         return
1393
1394                 # At this point we have a new video
1395                 self._downloader.increment_downloads()
1396                 video_id = mobj.group(2)
1397                 video_extension = 'flv'
1398
1399                 # Rewrite valid but non-extractable URLs as
1400                 # extractable English language /watch/ URLs
1401                 if re.match(self._VPAGE_URL, url) is None:
1402                         request = urllib2.Request(url)
1403                         try:
1404                                 webpage = urllib2.urlopen(request).read()
1405                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1406                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1407                                 return
1408
1409                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1410                         if mobj is None:
1411                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1412                                 return
1413                         yahoo_id = mobj.group(1)
1414
1415                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1416                         if mobj is None:
1417                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1418                                 return
1419                         yahoo_vid = mobj.group(1)
1420
1421                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1422                         return self._real_extract(url, new_video=False)
1423
1424                 # Retrieve video webpage to extract further information
1425                 request = urllib2.Request(url)
1426                 try:
1427                         self.report_download_webpage(video_id)
1428                         webpage = urllib2.urlopen(request).read()
1429                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1430                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1431                         return
1432
1433                 # Extract uploader and title from webpage
1434                 self.report_extraction(video_id)
1435                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1436                 if mobj is None:
1437                         self._downloader.trouble(u'ERROR: unable to extract video title')
1438                         return
1439                 video_title = mobj.group(1).decode('utf-8')
1440                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1441
1442                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1443                 if mobj is None:
1444                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1445                         return
1446                 video_uploader = mobj.group(1).decode('utf-8')
1447
1448                 # Extract video thumbnail
1449                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1450                 if mobj is None:
1451                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1452                         return
1453                 video_thumbnail = mobj.group(1).decode('utf-8')
1454
1455                 # Extract video description
1456                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1457                 if mobj is None:
1458                         self._downloader.trouble(u'ERROR: unable to extract video description')
1459                         return
1460                 video_description = mobj.group(1).decode('utf-8')
1461                 if not video_description: video_description = 'No description available.'
1462
1463                 # Extract video height and width
1464                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract video height')
1467                         return
1468                 yv_video_height = mobj.group(1)
1469
1470                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1471                 if mobj is None:
1472                         self._downloader.trouble(u'ERROR: unable to extract video width')
1473                         return
1474                 yv_video_width = mobj.group(1)
1475
1476                 # Retrieve video playlist to extract media URL
1477                 # I'm not completely sure what all these options are, but we
1478                 # seem to need most of them, otherwise the server sends a 401.
1479                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1480                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1481                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1482                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1483                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1484                 try:
1485                         self.report_download_webpage(video_id)
1486                         webpage = urllib2.urlopen(request).read()
1487                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1488                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1489                         return
1490
1491                 # Extract media URL from playlist XML
1492                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1493                 if mobj is None:
1494                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1495                         return
1496                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1497                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1498
1499                 try:
1500                         # Process video information
1501                         self._downloader.process_info({
1502                                 'id':           video_id.decode('utf-8'),
1503                                 'url':          video_url,
1504                                 'uploader':     video_uploader,
1505                                 'title':        video_title,
1506                                 'stitle':       simple_title,
1507                                 'ext':          video_extension.decode('utf-8'),
1508                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1509                                 'description':  video_description,
1510                                 'thumbnail':    video_thumbnail,
1511                                 'description':  video_description,
1512                                 'player_url':   None,
1513                         })
1514                 except UnavailableVideoError:
1515                         self._downloader.trouble(u'ERROR: unable to download video')
1516
1517
1518 class GenericIE(InfoExtractor):
1519         """Generic last-resort information extractor."""
1520
1521         def __init__(self, downloader=None):
1522                 InfoExtractor.__init__(self, downloader)
1523
1524         @staticmethod
1525         def suitable(url):
1526                 return True
1527
1528         def report_download_webpage(self, video_id):
1529                 """Report webpage download."""
1530                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1531                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1532
1533         def report_extraction(self, video_id):
1534                 """Report information extraction."""
1535                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1536
1537         def _real_initialize(self):
1538                 return
1539
1540         def _real_extract(self, url):
1541                 # At this point we have a new video
1542                 self._downloader.increment_downloads()
1543
1544                 video_id = url.split('/')[-1]
1545                 request = urllib2.Request(url)
1546                 try:
1547                         self.report_download_webpage(video_id)
1548                         webpage = urllib2.urlopen(request).read()
1549                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1550                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1551                         return
1552                 except ValueError, err:
1553                         # since this is the last-resort InfoExtractor, if
1554                         # this error is thrown, it'll be thrown here
1555                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1556                         return
1557
1558                 # Start with something easy: JW Player in SWFObject
1559                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1560                 if mobj is None:
1561                         # Broaden the search a little bit
1562                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1563                 if mobj is None:
1564                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1565                         return
1566
1567                 # It's possible that one of the regexes
1568                 # matched, but returned an empty group:
1569                 if mobj.group(1) is None:
1570                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1571                         return
1572
1573                 video_url = urllib.unquote(mobj.group(1))
1574                 video_id  = os.path.basename(video_url)
1575
1576                 # here's a fun little line of code for you:
1577                 video_extension = os.path.splitext(video_id)[1][1:]
1578                 video_id        = os.path.splitext(video_id)[0]
1579
1580                 # it's tempting to parse this further, but you would
1581                 # have to take into account all the variations like
1582                 #   Video Title - Site Name
1583                 #   Site Name | Video Title
1584                 #   Video Title - Tagline | Site Name
1585                 # and so on and so forth; it's just not practical
1586                 mobj = re.search(r'<title>(.*)</title>', webpage)
1587                 if mobj is None:
1588                         self._downloader.trouble(u'ERROR: unable to extract title')
1589                         return
1590                 video_title = mobj.group(1).decode('utf-8')
1591                 video_title = sanitize_title(video_title)
1592                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1593
1594                 # video uploader is domain name
1595                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1596                 if mobj is None:
1597                         self._downloader.trouble(u'ERROR: unable to extract title')
1598                         return
1599                 video_uploader = mobj.group(1).decode('utf-8')
1600
1601                 try:
1602                         # Process video information
1603                         self._downloader.process_info({
1604                                 'id':           video_id.decode('utf-8'),
1605                                 'url':          video_url.decode('utf-8'),
1606                                 'uploader':     video_uploader,
1607                                 'title':        video_title,
1608                                 'stitle':       simple_title,
1609                                 'ext':          video_extension.decode('utf-8'),
1610                                 'format':       u'NA',
1611                                 'player_url':   None,
1612                         })
1613                 except UnavailableVideoError, err:
1614                         self._downloader.trouble(u'ERROR: unable to download video')
1615
1616
1617 class YoutubeSearchIE(InfoExtractor):
1618         """Information Extractor for YouTube search queries."""
1619         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1620         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1621         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1622         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1623         _youtube_ie = None
1624         _max_youtube_results = 1000
1625
1626         def __init__(self, youtube_ie, downloader=None):
1627                 InfoExtractor.__init__(self, downloader)
1628                 self._youtube_ie = youtube_ie
1629         
1630         @staticmethod
1631         def suitable(url):
1632                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1633
1634         def report_download_page(self, query, pagenum):
1635                 """Report attempt to download playlist page with given number."""
1636                 query = query.decode(preferredencoding())
1637                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1638
1639         def _real_initialize(self):
1640                 self._youtube_ie.initialize()
1641         
1642         def _real_extract(self, query):
1643                 mobj = re.match(self._VALID_QUERY, query)
1644                 if mobj is None:
1645                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1646                         return
1647
1648                 prefix, query = query.split(':')
1649                 prefix = prefix[8:]
1650                 query  = query.encode('utf-8')
1651                 if prefix == '':
1652                         self._download_n_results(query, 1)
1653                         return
1654                 elif prefix == 'all':
1655                         self._download_n_results(query, self._max_youtube_results)
1656                         return
1657                 else:
1658                         try:
1659                                 n = long(prefix)
1660                                 if n <= 0:
1661                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1662                                         return
1663                                 elif n > self._max_youtube_results:
1664                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1665                                         n = self._max_youtube_results
1666                                 self._download_n_results(query, n)
1667                                 return
1668                         except ValueError: # parsing prefix as integer fails
1669                                 self._download_n_results(query, 1)
1670                                 return
1671
1672         def _download_n_results(self, query, n):
1673                 """Downloads a specified number of results for a query"""
1674
1675                 video_ids = []
1676                 already_seen = set()
1677                 pagenum = 1
1678
1679                 while True:
1680                         self.report_download_page(query, pagenum)
1681                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1682                         request = urllib2.Request(result_url, None, std_headers)
1683                         try:
1684                                 page = urllib2.urlopen(request).read()
1685                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1687                                 return
1688
1689                         # Extract video identifiers
1690                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1691                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1692                                 if video_id not in already_seen:
1693                                         video_ids.append(video_id)
1694                                         already_seen.add(video_id)
1695                                         if len(video_ids) == n:
1696                                                 # Specified n videos reached
1697                                                 for id in video_ids:
1698                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1699                                                 return
1700
1701                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1702                                 for id in video_ids:
1703                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1704                                 return
1705
1706                         pagenum = pagenum + 1
1707
1708 class GoogleSearchIE(InfoExtractor):
1709         """Information Extractor for Google Video search queries."""
1710         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1711         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1712         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1713         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1714         _google_ie = None
1715         _max_google_results = 1000
1716
1717         def __init__(self, google_ie, downloader=None):
1718                 InfoExtractor.__init__(self, downloader)
1719                 self._google_ie = google_ie
1720         
1721         @staticmethod
1722         def suitable(url):
1723                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1724
1725         def report_download_page(self, query, pagenum):
1726                 """Report attempt to download playlist page with given number."""
1727                 query = query.decode(preferredencoding())
1728                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1729
1730         def _real_initialize(self):
1731                 self._google_ie.initialize()
1732         
1733         def _real_extract(self, query):
1734                 mobj = re.match(self._VALID_QUERY, query)
1735                 if mobj is None:
1736                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1737                         return
1738
1739                 prefix, query = query.split(':')
1740                 prefix = prefix[8:]
1741                 query  = query.encode('utf-8')
1742                 if prefix == '':
1743                         self._download_n_results(query, 1)
1744                         return
1745                 elif prefix == 'all':
1746                         self._download_n_results(query, self._max_google_results)
1747                         return
1748                 else:
1749                         try:
1750                                 n = long(prefix)
1751                                 if n <= 0:
1752                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1753                                         return
1754                                 elif n > self._max_google_results:
1755                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1756                                         n = self._max_google_results
1757                                 self._download_n_results(query, n)
1758                                 return
1759                         except ValueError: # parsing prefix as integer fails
1760                                 self._download_n_results(query, 1)
1761                                 return
1762
1763         def _download_n_results(self, query, n):
1764                 """Downloads a specified number of results for a query"""
1765
1766                 video_ids = []
1767                 already_seen = set()
1768                 pagenum = 1
1769
1770                 while True:
1771                         self.report_download_page(query, pagenum)
1772                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1773                         request = urllib2.Request(result_url, None, std_headers)
1774                         try:
1775                                 page = urllib2.urlopen(request).read()
1776                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1777                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1778                                 return
1779
1780                         # Extract video identifiers
1781                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1782                                 video_id = mobj.group(1)
1783                                 if video_id not in already_seen:
1784                                         video_ids.append(video_id)
1785                                         already_seen.add(video_id)
1786                                         if len(video_ids) == n:
1787                                                 # Specified n videos reached
1788                                                 for id in video_ids:
1789                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1790                                                 return
1791
1792                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1793                                 for id in video_ids:
1794                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1795                                 return
1796
1797                         pagenum = pagenum + 1
1798
1799 class YahooSearchIE(InfoExtractor):
1800         """Information Extractor for Yahoo! Video search queries."""
1801         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1802         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1803         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1804         _MORE_PAGES_INDICATOR = r'\s*Next'
1805         _yahoo_ie = None
1806         _max_yahoo_results = 1000
1807
1808         def __init__(self, yahoo_ie, downloader=None):
1809                 InfoExtractor.__init__(self, downloader)
1810                 self._yahoo_ie = yahoo_ie
1811         
1812         @staticmethod
1813         def suitable(url):
1814                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1815
1816         def report_download_page(self, query, pagenum):
1817                 """Report attempt to download playlist page with given number."""
1818                 query = query.decode(preferredencoding())
1819                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1820
1821         def _real_initialize(self):
1822                 self._yahoo_ie.initialize()
1823         
1824         def _real_extract(self, query):
1825                 mobj = re.match(self._VALID_QUERY, query)
1826                 if mobj is None:
1827                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1828                         return
1829
1830                 prefix, query = query.split(':')
1831                 prefix = prefix[8:]
1832                 query  = query.encode('utf-8')
1833                 if prefix == '':
1834                         self._download_n_results(query, 1)
1835                         return
1836                 elif prefix == 'all':
1837                         self._download_n_results(query, self._max_yahoo_results)
1838                         return
1839                 else:
1840                         try:
1841                                 n = long(prefix)
1842                                 if n <= 0:
1843                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1844                                         return
1845                                 elif n > self._max_yahoo_results:
1846                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1847                                         n = self._max_yahoo_results
1848                                 self._download_n_results(query, n)
1849                                 return
1850                         except ValueError: # parsing prefix as integer fails
1851                                 self._download_n_results(query, 1)
1852                                 return
1853
1854         def _download_n_results(self, query, n):
1855                 """Downloads a specified number of results for a query"""
1856
1857                 video_ids = []
1858                 already_seen = set()
1859                 pagenum = 1
1860
1861                 while True:
1862                         self.report_download_page(query, pagenum)
1863                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1864                         request = urllib2.Request(result_url, None, std_headers)
1865                         try:
1866                                 page = urllib2.urlopen(request).read()
1867                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1868                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1869                                 return
1870
1871                         # Extract video identifiers
1872                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1873                                 video_id = mobj.group(1)
1874                                 if video_id not in already_seen:
1875                                         video_ids.append(video_id)
1876                                         already_seen.add(video_id)
1877                                         if len(video_ids) == n:
1878                                                 # Specified n videos reached
1879                                                 for id in video_ids:
1880                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1881                                                 return
1882
1883                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1884                                 for id in video_ids:
1885                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1886                                 return
1887
1888                         pagenum = pagenum + 1
1889
1890 class YoutubePlaylistIE(InfoExtractor):
1891         """Information Extractor for YouTube playlists."""
1892
1893         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1894         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1895         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1896         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1897         _youtube_ie = None
1898
1899         def __init__(self, youtube_ie, downloader=None):
1900                 InfoExtractor.__init__(self, downloader)
1901                 self._youtube_ie = youtube_ie
1902         
1903         @staticmethod
1904         def suitable(url):
1905                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1906
1907         def report_download_page(self, playlist_id, pagenum):
1908                 """Report attempt to download playlist page with given number."""
1909                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1910
1911         def _real_initialize(self):
1912                 self._youtube_ie.initialize()
1913         
1914         def _real_extract(self, url):
1915                 # Extract playlist id
1916                 mobj = re.match(self._VALID_URL, url)
1917                 if mobj is None:
1918                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1919                         return
1920
1921                 # Download playlist pages
1922                 playlist_id = mobj.group(1)
1923                 video_ids = []
1924                 pagenum = 1
1925
1926                 while True:
1927                         self.report_download_page(playlist_id, pagenum)
1928                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1929                         try:
1930                                 page = urllib2.urlopen(request).read()
1931                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1932                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1933                                 return
1934
1935                         # Extract video identifiers
1936                         ids_in_page = []
1937                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1938                                 if mobj.group(1) not in ids_in_page:
1939                                         ids_in_page.append(mobj.group(1))
1940                         video_ids.extend(ids_in_page)
1941
1942                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1943                                 break
1944                         pagenum = pagenum + 1
1945
1946                 playliststart = self._downloader.params.get('playliststart', 1)
1947                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1948                 if playliststart > 0:
1949                         video_ids = video_ids[playliststart:]
1950                         
1951                 for id in video_ids:
1952                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1953                 return
1954
1955 class YoutubeUserIE(InfoExtractor):
1956         """Information Extractor for YouTube users."""
1957
1958         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1959         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1960         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1961         _youtube_ie = None
1962
1963         def __init__(self, youtube_ie, downloader=None):
1964                 InfoExtractor.__init__(self, downloader)
1965                 self._youtube_ie = youtube_ie
1966         
1967         @staticmethod
1968         def suitable(url):
1969                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1970
1971         def report_download_page(self, username):
1972                 """Report attempt to download user page."""
1973                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1974
1975         def _real_initialize(self):
1976                 self._youtube_ie.initialize()
1977         
1978         def _real_extract(self, url):
1979                 # Extract username
1980                 mobj = re.match(self._VALID_URL, url)
1981                 if mobj is None:
1982                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1983                         return
1984
1985                 # Download user page
1986                 username = mobj.group(1)
1987                 video_ids = []
1988                 pagenum = 1
1989
1990                 self.report_download_page(username)
1991                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1992                 try:
1993                         page = urllib2.urlopen(request).read()
1994                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1995                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1996                         return
1997
1998                 # Extract video identifiers
1999                 ids_in_page = []
2000
2001                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2002                         if mobj.group(1) not in ids_in_page:
2003                                 ids_in_page.append(mobj.group(1))
2004                 video_ids.extend(ids_in_page)
2005
2006                 playliststart = self._downloader.params.get('playliststart', 1)
2007                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2008                 if playliststart > 0:
2009                         video_ids = video_ids[playliststart:]   
2010
2011                 for id in video_ids:
2012                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2013                 return
2014
class PostProcessor(object):
	"""Base class for post-processing hooks.

	Instances are registered on a downloader via its add_post_processor()
	method. After a successful download the downloader walks its chain of
	PostProcessors, feeding each one's run() return value to the next.
	A None return value stops the chain; otherwise it proceeds until the
	last processor has run.

	Like InfoExtractor, a PostProcessor and its downloader reference each
	other ("mutual registration").
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		"information" is a dictionary of the kind produced by
		InfoExtractors, extended with a "filepath" key naming the file
		that was downloaded.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the same one, possibly with fields changed)
		passes it along to the next processor. Implementations may also
		raise PostProcessingError, which the calling downloader handles.
		"""
		# Base implementation: pass the information through untouched.
		return information
2060         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite this script in place with the latest release.

			downloader is only used for its to_stdout() helper; filename
			is normally sys.argv[0]. Exits the process when the file is
			not writable. NOTE(review): the downloaded content is written
			without any integrity check -- confirm this is acceptable.
			"""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			# LATEST_VERSION holds the tag name of the newest release;
			# fetch the script as published under that tag.
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the first
		# opener; proxies presumably still work only because build_opener()
		# adds a ProxyHandler by default -- confirm before reordering.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.08.04',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		video_format.add_option('-b', '--best-quality',
				action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification: read URLs (one per line) from the file,
		# or from stdin when '-' is given; blank lines are dropped.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Username given without password: prompt interactively.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			# Convert "50k"/"44.6m" style limits to bytes per second.
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		if opts.playliststart is not None:
			try:
				opts.playliststart = long(opts.playliststart)
			except (TypeError, ValueError), err:
				parser.error(u'invalid playlist page specified')

		# Information extractors: the search/playlist/user extractors wrap
		# the basic per-video extractors and delegate to them.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any of the --get-* flags implies quiet simulation.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template: an explicit -o wins; otherwise one is derived
			# from the title/literal flags, with -%(format)s inserted when
			# downloading all formats so files do not overwrite each other.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			})
		# Registration order matters: more specific extractors first, so
		# e.g. playlist/user URLs are not claimed by the plain YoutubeIE.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing: -U with no URLs is a valid invocation.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')