Fix metacafe.com downloads for some videos (fixes issue #189)
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request, imitating a contemporary Firefox so
# that sites serving browser-dependent content behave as expected.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode characters considered safe for "simplified" titles used in filenames.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Sanity-check that the reported encoding is actually usable.
		u'TEST'.encode(pref)
	except Exception:
		# Previously a bare "except:", which would also swallow
		# KeyboardInterrupt/SystemExit. Anything wrong with the locale
		# falls back to UTF-8.
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	# The special name '-' means standard output.
	if filename == u'-':
		return (sys.stdout, filename)
	try:
		return (open(filename, open_mode), filename)
	except (IOError, OSError):
		# Retry after replacing characters that win32 forbids in file
		# names; an exception on this second attempt propagates to the
		# caller.
		fallback = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)
		return (open(fallback, open_mode), fallback)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they are not configured to
	ignore errors; it carries the corresponding error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that several
	downloads would end up written to the same file on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	May be raised by a PostProcessor's .run() method to signal a failure
	in the postprocessing task.
	"""
	pass
133
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that this particular
	video does not offer.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when a downloaded file turns out to
	be smaller than the size the server first announced, which usually
	means the connection was interrupted.
	"""
	# Both sizes are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	format_limit:   Highest quality format to try.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	retries:        Number of times to retry for HTTP error 503
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	# Option dictionary passed to the constructor.
	params = None
	# Registered InfoExtractors, consulted in order by download().
	_ies = []
	# Registered PostProcessors, run as a chain by post_process().
	_pps = []
	# Return code for the whole run: 0 on success, 1 if any error occurred.
	_download_retcode = None
	# Ordinal of the current download, exposed as the 'ord' template field.
	_num_downloads = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build every ancestor path of the file, e.g. a/b/c.flv -> ['a', 'a/b']
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.21M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			exponent = long(math.log(bytes, 1024.0))
		# Unit suffix indexed by the power of 1024: bytes, kilo, Mega, ...
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return the download progress percentage as a fixed-width string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate the remaining download time, formatted as 'MM:SS'."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size from the speed of the previous block."""
		# Allow the block size to at most halve or double per iteration.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix lowercases to '' which indexes position 0 -> 1024**0.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# skip_eol picks between '\n' and ''; the trailing comma keeps
				# the print statement from appending its own newline.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no substitution fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to drop back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# '\r' rewinds to the line start so the progress line updates in place.
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# Just terminate the in-place progress line.
			self.to_stdout(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# Synthetic template fields: current epoch time and a
			# zero-padded ordinal of this download.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# Several URLs with a non-templated output name would overwrite
		# each other; refuse up front.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			# A postprocessor returning None stops the chain.
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an RTMP stream by shelling out to the rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		# The [[], opts][flag] idiom appends opts only when flag is true.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			# If no new data arrived between attempts, give up resuming.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url into filename over HTTP (or via rtmpdump for RTMP).

		Honors the continuedl, retries, ratelimit and noprogress options.
		Returns True on success and False on a reported failure.
		"""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request stays without a Range header so the full content
		# length can be probed if resuming fails with HTTP 416.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code != 503 and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code != 503:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
						    (resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the block size to the observed throughput.
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the raw header value (a string), hence the str() compare.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
614
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	# Lazily flipped to True by initialize() on first use.
	_ready = False
	# FileDownloader this extractor reports to (may stay None).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		# The base class handles no URL at all; subclasses override this.
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# Run the subclass-specific setup at most once per instance.
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
685
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com.

	Handles optional login (command-line credentials or .netrc), age
	confirmation, and format selection via the get_video_info endpoint.
	"""

	# Group 1 matches the site prefix (optional); group 2 is the video id.
	# The conditional pattern (?(1).+)? only allows trailing junk when a
	# full URL (group 1) was given, so a bare id must match exactly.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to its file extension; anything missing falls
	# back to 'flv' in _real_extract.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language to English and optionally log in / confirm age.

		All failures here are soft (warnings) except age confirmation,
		which is reported through trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information for a YouTube URL and hand each
		selected format to the downloader via process_info()."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info
		# Some videos only return a usable info page (one containing a
		# 'token' field) for a particular 'el' value, so try several
		# until one works.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		# Collapse every run of non-alphanumeric characters into '_'.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
950
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	Delegates 'yt-' prefixed videos to a YoutubeIE instance; for native
	videos it confirms the family filter disclaimer once, then scrapes
	the watch page for the real media URL.
	"""

	# Group 1 is the video id, group 2 the simplified title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST the family-filter form so
		subsequent watch-page requests are not blocked."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information from a metacafe.com watch URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			# Hosted on YouTube: hand off to the YouTube extractor.
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# NOTE(review): message reads "unable retrieve" — missing "to".
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		# Older pages expose the media URL directly as a mediaURL
		# parameter, optionally protected by a gdaKey.
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Newer pages embed everything in a "flashvars" input; its
			# mediaData field is JSON-like text holding "mediaURL" and
			# the access "key" (fix for issue #189).
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			# Un-escape the JSON-style '\/' in the URL and append the key.
			video_url = '%s?__gda__=%s' % (mobj.group(1).replace('\\/', '/'), mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1091
1092
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion.

	Scrapes the watch page for the addVariable("video", ...) flash
	parameter to obtain the media URL.
	"""

	# Group 1 is the video id, group 2 the simplified title slug.
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No authentication or cookie setup needed for Dailymotion.
		return

	def _real_extract(self, url):
		"""Extract video information from a Dailymotion video URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			# NOTE(review): message reads "unable retrieve" — missing "to".
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1179
1180 class GoogleIE(InfoExtractor):
1181         """Information extractor for video.google.com."""
1182
1183         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1184
1185         def __init__(self, downloader=None):
1186                 InfoExtractor.__init__(self, downloader)
1187
1188         @staticmethod
1189         def suitable(url):
1190                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1191
1192         def report_download_webpage(self, video_id):
1193                 """Report webpage download."""
1194                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1195
1196         def report_extraction(self, video_id):
1197                 """Report information extraction."""
1198                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1199
1200         def _real_initialize(self):
1201                 return
1202
	def _real_extract(self, url):
		"""Extract and download a single Google Video.

		Fetches the video page, locates the media URL (mp4 when a
		download_url is advertised, flv otherwise), scrapes title and
		description, optionally resolves a thumbnail, and hands the
		collected metadata to the downloader via process_info().
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link advertised; fall back to the
			# flv URL embedded as \x-escaped JavaScript in the page.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escaping: '\x3d' -> '=', '\x26' -> '&'
		# (only present on the flv fallback; harmless no-op otherwise).
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Filesystem-safe title: collapse any run of non-alphanumerics to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (costs an extra search request, so only
		# done when the user explicitly asked for thumbnails)
		if self._downloader.params.get('forcethumbnail', False):
			# abs(int(...)) because docids can be negative numbers
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1288
1289
1290 class PhotobucketIE(InfoExtractor):
1291         """Information extractor for photobucket.com."""
1292
1293         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1294
1295         def __init__(self, downloader=None):
1296                 InfoExtractor.__init__(self, downloader)
1297
1298         @staticmethod
1299         def suitable(url):
1300                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1301
1302         def report_download_webpage(self, video_id):
1303                 """Report webpage download."""
1304                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1305
1306         def report_extraction(self, video_id):
1307                 """Report information extraction."""
1308                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1309
1310         def _real_initialize(self):
1311                 return
1312
1313         def _real_extract(self, url):
1314                 # Extract id from URL
1315                 mobj = re.match(self._VALID_URL, url)
1316                 if mobj is None:
1317                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1318                         return
1319
1320                 # At this point we have a new video
1321                 self._downloader.increment_downloads()
1322                 video_id = mobj.group(1)
1323
1324                 video_extension = 'flv'
1325
1326                 # Retrieve video webpage to extract further information
1327                 request = urllib2.Request(url)
1328                 try:
1329                         self.report_download_webpage(video_id)
1330                         webpage = urllib2.urlopen(request).read()
1331                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1332                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1333                         return
1334
1335                 # Extract URL, uploader, and title from webpage
1336                 self.report_extraction(video_id)
1337                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1338                 if mobj is None:
1339                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1340                         return
1341                 mediaURL = urllib.unquote(mobj.group(1))
1342
1343                 video_url = mediaURL
1344
1345                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1346                 if mobj is None:
1347                         self._downloader.trouble(u'ERROR: unable to extract title')
1348                         return
1349                 video_title = mobj.group(1).decode('utf-8')
1350                 video_title = sanitize_title(video_title)
1351                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1352
1353                 video_uploader = mobj.group(2).decode('utf-8')
1354
1355                 try:
1356                         # Process video information
1357                         self._downloader.process_info({
1358                                 'id':           video_id.decode('utf-8'),
1359                                 'url':          video_url.decode('utf-8'),
1360                                 'uploader':     video_uploader,
1361                                 'title':        video_title,
1362                                 'stitle':       simple_title,
1363                                 'ext':          video_extension.decode('utf-8'),
1364                                 'format':       u'NA',
1365                                 'player_url':   None,
1366                         })
1367                 except UnavailableVideoError:
1368                         self._downloader.trouble(u'ERROR: unable to download video')
1369
1370
1371 class YahooIE(InfoExtractor):
1372         """Information extractor for video.yahoo.com."""
1373
1374         # _VALID_URL matches all Yahoo! Video URLs
1375         # _VPAGE_URL matches only the extractable '/watch/' URLs
1376         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1377         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1378
1379         def __init__(self, downloader=None):
1380                 InfoExtractor.__init__(self, downloader)
1381
1382         @staticmethod
1383         def suitable(url):
1384                 return (re.match(YahooIE._VALID_URL, url) is not None)
1385
1386         def report_download_webpage(self, video_id):
1387                 """Report webpage download."""
1388                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1389
1390         def report_extraction(self, video_id):
1391                 """Report information extraction."""
1392                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1393
1394         def _real_initialize(self):
1395                 return
1396
1397         def _real_extract(self, url, new_video=True):
1398                 # Extract ID from URL
1399                 mobj = re.match(self._VALID_URL, url)
1400                 if mobj is None:
1401                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1402                         return
1403
1404                 # At this point we have a new video
1405                 self._downloader.increment_downloads()
1406                 video_id = mobj.group(2)
1407                 video_extension = 'flv'
1408
1409                 # Rewrite valid but non-extractable URLs as
1410                 # extractable English language /watch/ URLs
1411                 if re.match(self._VPAGE_URL, url) is None:
1412                         request = urllib2.Request(url)
1413                         try:
1414                                 webpage = urllib2.urlopen(request).read()
1415                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1416                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1417                                 return
1418
1419                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1420                         if mobj is None:
1421                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1422                                 return
1423                         yahoo_id = mobj.group(1)
1424
1425                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1426                         if mobj is None:
1427                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1428                                 return
1429                         yahoo_vid = mobj.group(1)
1430
1431                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1432                         return self._real_extract(url, new_video=False)
1433
1434                 # Retrieve video webpage to extract further information
1435                 request = urllib2.Request(url)
1436                 try:
1437                         self.report_download_webpage(video_id)
1438                         webpage = urllib2.urlopen(request).read()
1439                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1440                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1441                         return
1442
1443                 # Extract uploader and title from webpage
1444                 self.report_extraction(video_id)
1445                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1446                 if mobj is None:
1447                         self._downloader.trouble(u'ERROR: unable to extract video title')
1448                         return
1449                 video_title = mobj.group(1).decode('utf-8')
1450                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1451
1452                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1453                 if mobj is None:
1454                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1455                         return
1456                 video_uploader = mobj.group(1).decode('utf-8')
1457
1458                 # Extract video thumbnail
1459                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1460                 if mobj is None:
1461                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1462                         return
1463                 video_thumbnail = mobj.group(1).decode('utf-8')
1464
1465                 # Extract video description
1466                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1467                 if mobj is None:
1468                         self._downloader.trouble(u'ERROR: unable to extract video description')
1469                         return
1470                 video_description = mobj.group(1).decode('utf-8')
1471                 if not video_description: video_description = 'No description available.'
1472
1473                 # Extract video height and width
1474                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1475                 if mobj is None:
1476                         self._downloader.trouble(u'ERROR: unable to extract video height')
1477                         return
1478                 yv_video_height = mobj.group(1)
1479
1480                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1481                 if mobj is None:
1482                         self._downloader.trouble(u'ERROR: unable to extract video width')
1483                         return
1484                 yv_video_width = mobj.group(1)
1485
1486                 # Retrieve video playlist to extract media URL
1487                 # I'm not completely sure what all these options are, but we
1488                 # seem to need most of them, otherwise the server sends a 401.
1489                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1490                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1491                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1492                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1493                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1494                 try:
1495                         self.report_download_webpage(video_id)
1496                         webpage = urllib2.urlopen(request).read()
1497                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1498                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1499                         return
1500
1501                 # Extract media URL from playlist XML
1502                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1503                 if mobj is None:
1504                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1505                         return
1506                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1507                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1508
1509                 try:
1510                         # Process video information
1511                         self._downloader.process_info({
1512                                 'id':           video_id.decode('utf-8'),
1513                                 'url':          video_url,
1514                                 'uploader':     video_uploader,
1515                                 'title':        video_title,
1516                                 'stitle':       simple_title,
1517                                 'ext':          video_extension.decode('utf-8'),
1518                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1519                                 'description':  video_description,
1520                                 'thumbnail':    video_thumbnail,
1521                                 'description':  video_description,
1522                                 'player_url':   None,
1523                         })
1524                 except UnavailableVideoError:
1525                         self._downloader.trouble(u'ERROR: unable to download video')
1526
1527
1528 class GenericIE(InfoExtractor):
1529         """Generic last-resort information extractor."""
1530
1531         def __init__(self, downloader=None):
1532                 InfoExtractor.__init__(self, downloader)
1533
1534         @staticmethod
1535         def suitable(url):
1536                 return True
1537
1538         def report_download_webpage(self, video_id):
1539                 """Report webpage download."""
1540                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1541                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1542
1543         def report_extraction(self, video_id):
1544                 """Report information extraction."""
1545                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1546
1547         def _real_initialize(self):
1548                 return
1549
1550         def _real_extract(self, url):
1551                 # At this point we have a new video
1552                 self._downloader.increment_downloads()
1553
1554                 video_id = url.split('/')[-1]
1555                 request = urllib2.Request(url)
1556                 try:
1557                         self.report_download_webpage(video_id)
1558                         webpage = urllib2.urlopen(request).read()
1559                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1560                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1561                         return
1562                 except ValueError, err:
1563                         # since this is the last-resort InfoExtractor, if
1564                         # this error is thrown, it'll be thrown here
1565                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1566                         return
1567
1568                 # Start with something easy: JW Player in SWFObject
1569                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1570                 if mobj is None:
1571                         # Broaden the search a little bit
1572                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1573                 if mobj is None:
1574                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1575                         return
1576
1577                 # It's possible that one of the regexes
1578                 # matched, but returned an empty group:
1579                 if mobj.group(1) is None:
1580                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1581                         return
1582
1583                 video_url = urllib.unquote(mobj.group(1))
1584                 video_id  = os.path.basename(video_url)
1585
1586                 # here's a fun little line of code for you:
1587                 video_extension = os.path.splitext(video_id)[1][1:]
1588                 video_id        = os.path.splitext(video_id)[0]
1589
1590                 # it's tempting to parse this further, but you would
1591                 # have to take into account all the variations like
1592                 #   Video Title - Site Name
1593                 #   Site Name | Video Title
1594                 #   Video Title - Tagline | Site Name
1595                 # and so on and so forth; it's just not practical
1596                 mobj = re.search(r'<title>(.*)</title>', webpage)
1597                 if mobj is None:
1598                         self._downloader.trouble(u'ERROR: unable to extract title')
1599                         return
1600                 video_title = mobj.group(1).decode('utf-8')
1601                 video_title = sanitize_title(video_title)
1602                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1603
1604                 # video uploader is domain name
1605                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1606                 if mobj is None:
1607                         self._downloader.trouble(u'ERROR: unable to extract title')
1608                         return
1609                 video_uploader = mobj.group(1).decode('utf-8')
1610
1611                 try:
1612                         # Process video information
1613                         self._downloader.process_info({
1614                                 'id':           video_id.decode('utf-8'),
1615                                 'url':          video_url.decode('utf-8'),
1616                                 'uploader':     video_uploader,
1617                                 'title':        video_title,
1618                                 'stitle':       simple_title,
1619                                 'ext':          video_extension.decode('utf-8'),
1620                                 'format':       u'NA',
1621                                 'player_url':   None,
1622                         })
1623                 except UnavailableVideoError, err:
1624                         self._downloader.trouble(u'ERROR: unable to download video')
1625
1626
1627 class YoutubeSearchIE(InfoExtractor):
1628         """Information Extractor for YouTube search queries."""
1629         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1630         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1631         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1632         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1633         _youtube_ie = None
1634         _max_youtube_results = 1000
1635
1636         def __init__(self, youtube_ie, downloader=None):
1637                 InfoExtractor.__init__(self, downloader)
1638                 self._youtube_ie = youtube_ie
1639         
1640         @staticmethod
1641         def suitable(url):
1642                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1643
1644         def report_download_page(self, query, pagenum):
1645                 """Report attempt to download playlist page with given number."""
1646                 query = query.decode(preferredencoding())
1647                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1648
1649         def _real_initialize(self):
1650                 self._youtube_ie.initialize()
1651         
1652         def _real_extract(self, query):
1653                 mobj = re.match(self._VALID_QUERY, query)
1654                 if mobj is None:
1655                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1656                         return
1657
1658                 prefix, query = query.split(':')
1659                 prefix = prefix[8:]
1660                 query  = query.encode('utf-8')
1661                 if prefix == '':
1662                         self._download_n_results(query, 1)
1663                         return
1664                 elif prefix == 'all':
1665                         self._download_n_results(query, self._max_youtube_results)
1666                         return
1667                 else:
1668                         try:
1669                                 n = long(prefix)
1670                                 if n <= 0:
1671                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1672                                         return
1673                                 elif n > self._max_youtube_results:
1674                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1675                                         n = self._max_youtube_results
1676                                 self._download_n_results(query, n)
1677                                 return
1678                         except ValueError: # parsing prefix as integer fails
1679                                 self._download_n_results(query, 1)
1680                                 return
1681
1682         def _download_n_results(self, query, n):
1683                 """Downloads a specified number of results for a query"""
1684
1685                 video_ids = []
1686                 already_seen = set()
1687                 pagenum = 1
1688
1689                 while True:
1690                         self.report_download_page(query, pagenum)
1691                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1692                         request = urllib2.Request(result_url, None, std_headers)
1693                         try:
1694                                 page = urllib2.urlopen(request).read()
1695                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1696                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1697                                 return
1698
1699                         # Extract video identifiers
1700                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1701                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1702                                 if video_id not in already_seen:
1703                                         video_ids.append(video_id)
1704                                         already_seen.add(video_id)
1705                                         if len(video_ids) == n:
1706                                                 # Specified n videos reached
1707                                                 for id in video_ids:
1708                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1709                                                 return
1710
1711                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1712                                 for id in video_ids:
1713                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1714                                 return
1715
1716                         pagenum = pagenum + 1
1717
1718 class GoogleSearchIE(InfoExtractor):
1719         """Information Extractor for Google Video search queries."""
1720         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1721         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1722         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1723         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1724         _google_ie = None
1725         _max_google_results = 1000
1726
1727         def __init__(self, google_ie, downloader=None):
1728                 InfoExtractor.__init__(self, downloader)
1729                 self._google_ie = google_ie
1730         
1731         @staticmethod
1732         def suitable(url):
1733                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1734
1735         def report_download_page(self, query, pagenum):
1736                 """Report attempt to download playlist page with given number."""
1737                 query = query.decode(preferredencoding())
1738                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1739
1740         def _real_initialize(self):
1741                 self._google_ie.initialize()
1742         
1743         def _real_extract(self, query):
1744                 mobj = re.match(self._VALID_QUERY, query)
1745                 if mobj is None:
1746                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1747                         return
1748
1749                 prefix, query = query.split(':')
1750                 prefix = prefix[8:]
1751                 query  = query.encode('utf-8')
1752                 if prefix == '':
1753                         self._download_n_results(query, 1)
1754                         return
1755                 elif prefix == 'all':
1756                         self._download_n_results(query, self._max_google_results)
1757                         return
1758                 else:
1759                         try:
1760                                 n = long(prefix)
1761                                 if n <= 0:
1762                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1763                                         return
1764                                 elif n > self._max_google_results:
1765                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1766                                         n = self._max_google_results
1767                                 self._download_n_results(query, n)
1768                                 return
1769                         except ValueError: # parsing prefix as integer fails
1770                                 self._download_n_results(query, 1)
1771                                 return
1772
1773         def _download_n_results(self, query, n):
1774                 """Downloads a specified number of results for a query"""
1775
1776                 video_ids = []
1777                 already_seen = set()
1778                 pagenum = 1
1779
1780                 while True:
1781                         self.report_download_page(query, pagenum)
1782                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1783                         request = urllib2.Request(result_url, None, std_headers)
1784                         try:
1785                                 page = urllib2.urlopen(request).read()
1786                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1787                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1788                                 return
1789
1790                         # Extract video identifiers
1791                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1792                                 video_id = mobj.group(1)
1793                                 if video_id not in already_seen:
1794                                         video_ids.append(video_id)
1795                                         already_seen.add(video_id)
1796                                         if len(video_ids) == n:
1797                                                 # Specified n videos reached
1798                                                 for id in video_ids:
1799                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1800                                                 return
1801
1802                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1803                                 for id in video_ids:
1804                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1805                                 return
1806
1807                         pagenum = pagenum + 1
1808
1809 class YahooSearchIE(InfoExtractor):
1810         """Information Extractor for Yahoo! Video search queries."""
1811         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1812         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1813         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1814         _MORE_PAGES_INDICATOR = r'\s*Next'
1815         _yahoo_ie = None
1816         _max_yahoo_results = 1000
1817
1818         def __init__(self, yahoo_ie, downloader=None):
1819                 InfoExtractor.__init__(self, downloader)
1820                 self._yahoo_ie = yahoo_ie
1821         
1822         @staticmethod
1823         def suitable(url):
1824                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1825
1826         def report_download_page(self, query, pagenum):
1827                 """Report attempt to download playlist page with given number."""
1828                 query = query.decode(preferredencoding())
1829                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1830
1831         def _real_initialize(self):
1832                 self._yahoo_ie.initialize()
1833         
1834         def _real_extract(self, query):
1835                 mobj = re.match(self._VALID_QUERY, query)
1836                 if mobj is None:
1837                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1838                         return
1839
1840                 prefix, query = query.split(':')
1841                 prefix = prefix[8:]
1842                 query  = query.encode('utf-8')
1843                 if prefix == '':
1844                         self._download_n_results(query, 1)
1845                         return
1846                 elif prefix == 'all':
1847                         self._download_n_results(query, self._max_yahoo_results)
1848                         return
1849                 else:
1850                         try:
1851                                 n = long(prefix)
1852                                 if n <= 0:
1853                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1854                                         return
1855                                 elif n > self._max_yahoo_results:
1856                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1857                                         n = self._max_yahoo_results
1858                                 self._download_n_results(query, n)
1859                                 return
1860                         except ValueError: # parsing prefix as integer fails
1861                                 self._download_n_results(query, 1)
1862                                 return
1863
1864         def _download_n_results(self, query, n):
1865                 """Downloads a specified number of results for a query"""
1866
1867                 video_ids = []
1868                 already_seen = set()
1869                 pagenum = 1
1870
1871                 while True:
1872                         self.report_download_page(query, pagenum)
1873                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1874                         request = urllib2.Request(result_url, None, std_headers)
1875                         try:
1876                                 page = urllib2.urlopen(request).read()
1877                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1878                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1879                                 return
1880
1881                         # Extract video identifiers
1882                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1883                                 video_id = mobj.group(1)
1884                                 if video_id not in already_seen:
1885                                         video_ids.append(video_id)
1886                                         already_seen.add(video_id)
1887                                         if len(video_ids) == n:
1888                                                 # Specified n videos reached
1889                                                 for id in video_ids:
1890                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1891                                                 return
1892
1893                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1894                                 for id in video_ids:
1895                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1896                                 return
1897
1898                         pagenum = pagenum + 1
1899
1900 class YoutubePlaylistIE(InfoExtractor):
1901         """Information Extractor for YouTube playlists."""
1902
1903         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1904         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1905         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1906         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1907         _youtube_ie = None
1908
1909         def __init__(self, youtube_ie, downloader=None):
1910                 InfoExtractor.__init__(self, downloader)
1911                 self._youtube_ie = youtube_ie
1912         
1913         @staticmethod
1914         def suitable(url):
1915                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1916
1917         def report_download_page(self, playlist_id, pagenum):
1918                 """Report attempt to download playlist page with given number."""
1919                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1920
1921         def _real_initialize(self):
1922                 self._youtube_ie.initialize()
1923         
1924         def _real_extract(self, url):
1925                 # Extract playlist id
1926                 mobj = re.match(self._VALID_URL, url)
1927                 if mobj is None:
1928                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1929                         return
1930
1931                 # Download playlist pages
1932                 playlist_id = mobj.group(1)
1933                 video_ids = []
1934                 pagenum = 1
1935
1936                 while True:
1937                         self.report_download_page(playlist_id, pagenum)
1938                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1939                         try:
1940                                 page = urllib2.urlopen(request).read()
1941                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1943                                 return
1944
1945                         # Extract video identifiers
1946                         ids_in_page = []
1947                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1948                                 if mobj.group(1) not in ids_in_page:
1949                                         ids_in_page.append(mobj.group(1))
1950                         video_ids.extend(ids_in_page)
1951
1952                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1953                                 break
1954                         pagenum = pagenum + 1
1955
1956                 playliststart = self._downloader.params.get('playliststart', 1)
1957                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1958                 if playliststart > 0:
1959                         video_ids = video_ids[playliststart:]
1960                         
1961                 for id in video_ids:
1962                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1963                 return
1964
1965 class YoutubeUserIE(InfoExtractor):
1966         """Information Extractor for YouTube users."""
1967
1968         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1969         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1970         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1971         _youtube_ie = None
1972
1973         def __init__(self, youtube_ie, downloader=None):
1974                 InfoExtractor.__init__(self, downloader)
1975                 self._youtube_ie = youtube_ie
1976         
1977         @staticmethod
1978         def suitable(url):
1979                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1980
1981         def report_download_page(self, username):
1982                 """Report attempt to download user page."""
1983                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1984
1985         def _real_initialize(self):
1986                 self._youtube_ie.initialize()
1987         
1988         def _real_extract(self, url):
1989                 # Extract username
1990                 mobj = re.match(self._VALID_URL, url)
1991                 if mobj is None:
1992                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1993                         return
1994
1995                 # Download user page
1996                 username = mobj.group(1)
1997                 video_ids = []
1998                 pagenum = 1
1999
2000                 self.report_download_page(username)
2001                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2002                 try:
2003                         page = urllib2.urlopen(request).read()
2004                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2006                         return
2007
2008                 # Extract video identifiers
2009                 ids_in_page = []
2010
2011                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2012                         if mobj.group(1) not in ids_in_page:
2013                                 ids_in_page.append(mobj.group(1))
2014                 video_ids.extend(ids_in_page)
2015
2016                 playliststart = self._downloader.params.get('playliststart', 1)
2017                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2018                 if playliststart > 0:
2019                         video_ids = video_ids[playliststart:]   
2020
2021                 for id in video_ids:
2022                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2023                 return
2024
class PostProcessor(object):
        """Base class for post-processing steps.

        Instances are registered with a downloader through its
        add_post_processor() method. After each successful download the
        downloader walks its chain of PostProcessors, calling run() on each
        one: the first receives an initial information dictionary and every
        later one receives whatever the previous run() returned. The chain
        stops as soon as a run() returns None or the end is reached.

        Like InfoExtractor objects, PostProcessors take part in a "mutual
        registration" handshake with their downloader.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Sets the downloader for this PP."""
                self._downloader = downloader

        def run(self, information):
                """Process one downloaded file.

                "information" is a dictionary shaped like the ones composed
                by InfoExtractors, plus an extra "filepath" key pointing at
                the downloaded file.

                Returning None stops the post-processing chain; returning a
                (possibly modified) information dictionary passes it on to
                the next PostProcessor in the chain. Implementations may
                also raise a PostProcessingError exception, which the
                calling downloader takes into account.
                """
                # Default implementation: pass the data through untouched
                return information
2070         
### MAIN PROGRAM ###
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # Function to update the program file with the latest version from bitbucket.org
                def update_self(downloader, filename):
                        """Overwrite this script in place with the latest released version."""
                        # Note: downloader only used for options
                        if not os.access (filename, os.W_OK):
                                sys.exit('ERROR: no write permissions on %s' % filename)

                        downloader.to_stdout('Updating to latest stable version...')
                        latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
                        latest_version = urllib.urlopen(latest_url).read().strip()
                        prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
                        newcontent = urllib.urlopen(prog_url).read()
                        # NOTE(review): the file is rewritten in place, not atomically;
                        # an interrupted download would leave a truncated script
                        stream = open(filename, 'w')
                        stream.write(newcontent)
                        stream.close()
                        downloader.to_stdout('Updated to version %s' % latest_version)

                # General configuration
                # NOTE(review): the second install_opener replaces the first; proxies
                # presumably still work because build_opener adds a ProxyHandler by
                # default — confirm
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
                urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='2010.08.04',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-U', '--update',
                                action='store_true', dest='update_self', help='update this program to latest stable version')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
                parser.add_option('-R', '--retries',
                                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
                parser.add_option('--playlist-start',
                                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='USERNAME', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PASSWORD', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FORMAT', help='video format code')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('--all-formats',
                                action='store_const', dest='format', help='download all available video formats', const='-1')
                video_format.add_option('--max-quality',
                                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
                video_format.add_option('-b', '--best-quality',
                                action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                verbosity.add_option('--get-thumbnail',
                                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
                verbosity.add_option('--get-description',
                                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
                verbosity.add_option('--no-progress',
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification: read URLs one per line, skipping blanks
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                if opts.batchfile == '-':
                                        batchfd = sys.stdin
                                else:
                                        batchfd = open(opts.batchfile, 'r')
                                batchurls = batchfd.readlines()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if opts.bestquality:
                        print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        # Username given without a password: prompt interactively
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit
                if opts.retries is not None:
                        try:
                                opts.retries = long(opts.retries)
                        except (TypeError, ValueError), err:
                                parser.error(u'invalid retry count specified')
                if opts.playliststart is not None:
                        try:
                                opts.playliststart = long(opts.playliststart)
                        except (TypeError, ValueError), err:
                                parser.error(u'invalid playlist page specified')

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                dailymotion_ie = DailymotionIE()
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_user_ie = YoutubeUserIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
                google_ie = GoogleIE()
                google_search_ie = GoogleSearchIE(google_ie)
                photobucket_ie = PhotobucketIE()
                yahoo_ie = YahooIE()
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
                generic_ie = GenericIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'forcethumbnail': opts.getthumbnail,
                        'forcedescription': opts.getdescription,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
                        'format': opts.format,
                        'format_limit': opts.format_limit,
                        # Output template: explicit -o wins; otherwise one is derived
                        # from the --all-formats/--title/--literal combination
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'retries': opts.retries,
                        'continuedl': opts.continue_dl,
                        'noprogress': opts.noprogress,
                        'playliststart': opts.playliststart,
                        })
                # Registration order matters: extractors are tried in this order
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(youtube_user_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(dailymotion_ie)
                fd.add_info_extractor(youtube_ie)
                fd.add_info_extractor(google_ie)
                fd.add_info_extractor(google_search_ie)
                fd.add_info_extractor(photobucket_ie)
                fd.add_info_extractor(yahoo_ie)
                fd.add_info_extractor(yahoo_search_ie)

                # This must come last since it's the
                # fallback if none of the others work
                fd.add_info_extractor(generic_ie)

                # Update version
                if opts.update_self:
                        update_self(fd, sys.argv[0])

                # Maybe do nothing
                if len(all_urls) < 1:
                        if not opts.update_self:
                                parser.error(u'you must provide at least one URL')
                        else:
                                sys.exit()
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')