c67bd38551d774bc1da893a707df667a87f27d91
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Default HTTP headers sent with every request. The hard-coded Firefox
# User-Agent makes requests look like a regular browser, since some sites
# serve different content (or refuse service) to unknown clients.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Firefox/3.6.7',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for simplified titles: ASCII letters and digits,
# built as unicode objects (str.decode on a str literal is Python 2 only).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.

	The original implementation wrapped this computation in an infinite
	generator and called .next() on it once, which added no value; the
	value is now computed and returned directly.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Make sure the reported encoding is actually usable: some broken
		# locales report a name the codecs machinery does not know.
		u'TEST'.encode(pref)
	except Exception:
		# Broad on purpose (narrowed from a bare except:): any failure in
		# locale lookup or in the probe encode falls back to UTF-8.
		pref = 'UTF-8'
	return pref
53
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Numeric character reference: hexadecimal (&#xE9;) or decimal (&#233;).
	# BUG FIX: the previous pattern ur'(?u)#(x?\d+)' never matched hex
	# references containing the letters a-f, so entities such as &#xE9;
	# incorrectly fell through to the literal-representation branch below.
	mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# Prefix with '0' so long() sees '0x...' and parses base 16.
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
79
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Decode HTML entities first, then neutralize the path separator so the
	# title cannot escape into other directories.
	decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
	return decoded.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	# A lone dash means standard output; the comparison cannot raise, so it
	# lives outside the try block.
	if filename == u'-':
		return (sys.stdout, filename)
	try:
		return (open(filename, open_mode), filename)
	except (IOError, OSError):
		# Replace the characters win32 filesystems forbid and retry once.
		# An exception on this second attempt propagates to the caller.
		safe_name = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)
		return (open(safe_name, open_mode), safe_name)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Raised by FileDownloader objects when they are not configured to
	continue on errors; it carries the appropriate error message.
	"""
117
class SameFileError(Exception):
	"""Same File exception.

	Raised by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	May be raised by a PostProcessor's .run() method to signal an error in
	the postprocessing task.
	"""
133
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Raised when a video is requested in a format that is not available for
	that video.
	"""
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the data they downloaded is
	smaller than what the server first announced, indicating that the
	connection was probably interrupted.
	"""
	# Both counters are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Deliberately no Exception.__init__ call: the original relied on
		# BaseException.__new__ populating .args, and calling __init__
		# with no arguments would clear them.
		self.expected = expected
		self.downloaded = downloaded
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	format_limit:   Highest quality format to try.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	retries:        Number of times to retry for HTTP error 503
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	# Class-level declarations; all of them are rebound per instance in
	# __init__, so the mutable list defaults are never actually shared.
	params = None			# options dictionary (see class docstring)
	_ies = []			# registered InfoExtractor objects, in order
	_pps = []			# registered PostProcessor chain
	_download_retcode = None	# process exit code accumulated by trouble()
	_num_downloads = None		# ordinal used for the %(ord)s template field

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p".

		The last path component is treated as the file name and is NOT
		created. NOTE(review): exists()/mkdir() is racy against concurrent
		creators, and mkdir() may still raise if a component appears in
		between — callers catch OSError/IOError.
		"""
		components = filename.split(os.sep)
		# Build the list of ancestor paths: 'a', 'a/b', 'a/b/c', ...
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a short human-readable string, e.g. '1.17M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Integer power of 1024 that the quantity falls into.
			# NOTE(review): values >= 1024**9 would index past 'Y' and
			# raise IndexError; unreachable for realistic file sizes.
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining time as 'MM:SS', or '--:--' when unknown."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# The field only has room for two minute digits.
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return average download speed as a right-aligned string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size: track the measured rate, but change it
		by at most a factor of two per step and never exceed 4 MB."""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer.

		Accepts an optional single-letter 1024-based suffix (k/M/G/...).
		With no suffix, str.index('') returns 0, so the multiplier is 1.
		Returns None if the string does not parse.
		"""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL.

		Reads a single byte to force the connection, then reports the
		post-redirect URL. Network errors propagate to the caller.
		"""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE reports results back through us.
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode.

		skip_eol=True suppresses the newline (used for the in-place
		progress line); the trailing comma on the Python 2 print statement
		suppresses print's own newline in both cases.
		"""
		try:
			if not self.params.get('quiet', False):
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# With ignoreerrors set we keep going but remember the failure
		# for the process exit code.
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep long enough that the average speed drops to the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress (rewrites the same line via \\r)."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# With the progress bar active, just terminate its line.
			self.to_stdout(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor.

		In simulate mode only verifies the URL and honors the force*
		printing options; otherwise expands the output template, creates
		directories and downloads (then post-processes) the file.
		"""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableVideoError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		# NOTE(review): if the template expansion above failed and
		# 'ignoreerrors' is set, trouble() returns and 'filename' is
		# unbound here, causing a NameError on the next line — confirm
		# whether a 'return' after trouble() was intended.
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs.

		Each URL is handed to the first registered InfoExtractor whose
		suitable() accepts it. Returns the process return code (0 on
		success, 1 if any download failed with ignoreerrors set).
		"""
		if len(url_list) > 1 and self.fixed_template():
			# A fixed template would make every URL overwrite the same file.
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file.

		A post processor returning None aborts the rest of the chain.
		"""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool.

		Returns True on success, False on failure (after calling trouble()).
		"""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrupted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			# Resume with -e; add '-k 1' again only after exit code 1.
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			if prevsize == cursize and retval == 1:
				# No progress was made on the retry; give up.
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (or delegate rtmp:// URLs to
		rtmpdump). Handles resume, 503 retries and rate limiting; returns
		True on success. Raises ContentTooShortError if the server served
		fewer bytes than announced."""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request has no Range header; it is the fallback used to
		# probe the full length when a resume attempt is rejected.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while True:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code == 503:
					# Retry in case of HTTP error 503
					count += 1
					if count <= retries:
						self.report_retry(count, retries)
						continue
				if err.code != 416: #  416 is 'Requested range not satisfiable'
					raise
				# Unable to resume (the Range request was rejected):
				# probe the resource without a Range header to find out why.
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']

				if content_length is not None and long(content_length) == resume_len:
					# Because the file had already been fully downloaded
					self.report_file_already_downloaded(filename)
					return True
				else:
					# Because the server didn't let us
					self.report_unable_to_resume()
					open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time (so a failed connection never leaves
			# an empty file behind)
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble('\nERROR: unable to write data: %s' % str(err))
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the raw header string, so compare string forms.
		# NOTE(review): after a resume, byte_counter counts only the newly
		# downloaded bytes while data_len covers the requested range — this
		# comparison appears to rely on that; confirm before changing.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
611
class InfoExtractor(object):
	"""Information Extractor class.

	An information extractor takes a URL and extracts information about
	the video (or videos) it refers to: the real video URL, the title, a
	simplified title, the uploader and so on. The result is a dictionary
	handed to the FileDownloader, which may then download the video to
	the file system, among other outcomes. Each dictionary must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should re-define the _real_initialize() and _real_extract()
	methods, as well as the suitable() static method, and will usually be
	instantiated and registered with the main downloader.
	"""

	# Lazily flipped to True by initialize() on first use.
	_ready = False
	# FileDownloader in charge of this IE (set via set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
682
683 class YoutubeIE(InfoExtractor):
684         """Information extractor for youtube.com."""
685
686         _VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
687         _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
688         _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
689         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
690         _NETRC_MACHINE = 'youtube'
691         # Listed in order of quality
692         _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
693         _video_extensions = {
694                 '13': '3gp',
695                 '17': 'mp4',
696                 '18': 'mp4',
697                 '22': 'mp4',
698                 '37': 'mp4',
699                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
700                 '43': 'webm',
701                 '45': 'webm',
702         }
703
704         @staticmethod
705         def suitable(url):
706                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
707
708         def report_lang(self):
709                 """Report attempt to set language."""
710                 self._downloader.to_stdout(u'[youtube] Setting language')
711
712         def report_login(self):
713                 """Report attempt to log in."""
714                 self._downloader.to_stdout(u'[youtube] Logging in')
715         
716         def report_age_confirmation(self):
717                 """Report attempt to confirm age."""
718                 self._downloader.to_stdout(u'[youtube] Confirming age')
719         
720         def report_video_webpage_download(self, video_id):
721                 """Report attempt to download video webpage."""
722                 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
723         
724         def report_video_info_webpage_download(self, video_id):
725                 """Report attempt to download video info webpage."""
726                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
727         
728         def report_information_extraction(self, video_id):
729                 """Report attempt to extract video information."""
730                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
731         
732         def report_unavailable_format(self, video_id, format):
733                 """Report extracted video URL."""
734                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
735         
736         def report_rtmp_download(self):
737                 """Indicate the download will use the RTMP protocol."""
738                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
739         
740         def _real_initialize(self):
741                 if self._downloader is None:
742                         return
743
744                 username = None
745                 password = None
746                 downloader_params = self._downloader.params
747
748                 # Attempt to use provided username and password or .netrc data
749                 if downloader_params.get('username', None) is not None:
750                         username = downloader_params['username']
751                         password = downloader_params['password']
752                 elif downloader_params.get('usenetrc', False):
753                         try:
754                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
755                                 if info is not None:
756                                         username = info[0]
757                                         password = info[2]
758                                 else:
759                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
760                         except (IOError, netrc.NetrcParseError), err:
761                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
762                                 return
763
764                 # Set language
765                 request = urllib2.Request(self._LANG_URL, None, std_headers)
766                 try:
767                         self.report_lang()
768                         urllib2.urlopen(request).read()
769                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
770                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
771                         return
772
773                 # No authentication to be performed
774                 if username is None:
775                         return
776
777                 # Log in
778                 login_form = {
779                                 'current_form': 'loginForm',
780                                 'next':         '/',
781                                 'action_login': 'Log In',
782                                 'username':     username,
783                                 'password':     password,
784                                 }
785                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
786                 try:
787                         self.report_login()
788                         login_results = urllib2.urlopen(request).read()
789                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
790                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
791                                 return
792                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
793                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
794                         return
795         
796                 # Confirm age
797                 age_form = {
798                                 'next_url':             '/',
799                                 'action_confirm':       'Confirm',
800                                 }
801                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
802                 try:
803                         self.report_age_confirmation()
804                         age_results = urllib2.urlopen(request).read()
805                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
806                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
807                         return
808
809         def _real_extract(self, url):
810                 # Extract video id from URL
811                 mobj = re.match(self._VALID_URL, url)
812                 if mobj is None:
813                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
814                         return
815                 video_id = mobj.group(2)
816
817                 # Get video webpage
818                 self.report_video_webpage_download(video_id)
819                 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
820                 try:
821                         video_webpage = urllib2.urlopen(request).read()
822                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
823                         self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
824                         return
825
826                 # Attempt to extract SWF player URL
827                 mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
828                 if mobj is not None:
829                         player_url = mobj.group(1)
830                 else:
831                         player_url = None
832
833                 # Get video info
834                 self.report_video_info_webpage_download(video_id)
835                 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
836                         video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
837                                            % (video_id, el_type))
838                         request = urllib2.Request(video_info_url, None, std_headers)
839                         try:
840                                 video_info_webpage = urllib2.urlopen(request).read()
841                                 video_info = parse_qs(video_info_webpage)
842                                 if 'token' in video_info:
843                                         break
844                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
845                                 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
846                                 return
847                 self.report_information_extraction(video_id)
848
849                 # uploader
850                 if 'author' not in video_info:
851                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
852                         return
853                 video_uploader = urllib.unquote_plus(video_info['author'][0])
854
855                 # title
856                 if 'title' not in video_info:
857                         self._downloader.trouble(u'ERROR: unable to extract video title')
858                         return
859                 video_title = urllib.unquote_plus(video_info['title'][0])
860                 video_title = video_title.decode('utf-8')
861                 video_title = sanitize_title(video_title)
862
863                 # simplified title
864                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
865                 simple_title = simple_title.strip(ur'_')
866
867                 # thumbnail image
868                 if 'thumbnail_url' not in video_info:
869                         self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
870                         video_thumbnail = ''
871                 else:   # don't panic if we can't find it
872                         video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
873
874                 # description
875                 video_description = 'No description available.'
876                 if self._downloader.params.get('forcedescription', False):
877                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
878                         if mobj is not None:
879                                 video_description = mobj.group(1)
880
881                 # token
882                 video_token = urllib.unquote_plus(video_info['token'][0])
883
884                 # Decide which formats to download
885                 requested_format = self._downloader.params.get('format', None)
886                 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
887
888                 if 'fmt_url_map' in video_info:
889                         url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
890                         format_limit = self._downloader.params.get('format_limit', None)
891                         if format_limit is not None and format_limit in self._available_formats:
892                                 format_list = self._available_formats[self._available_formats.index(format_limit):]
893                         else:
894                                 format_list = self._available_formats
895                         existing_formats = [x for x in format_list if x in url_map]
896                         if len(existing_formats) == 0:
897                                 self._downloader.trouble(u'ERROR: no known formats available for video')
898                                 return
899                         if requested_format is None:
900                                 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
901                         elif requested_format == '-1':
902                                 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
903                         else:
904                                 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
905
906                 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
907                         self.report_rtmp_download()
908                         video_url_list = [(None, video_info['conn'][0])]
909
910                 else:
911                         self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
912                         return
913
914                 for format_param, video_real_url in video_url_list:
915                         # At this point we have a new video
916                         self._downloader.increment_downloads()
917
918                         # Extension
919                         video_extension = self._video_extensions.get(format_param, 'flv')
920
921                         # Find the video URL in fmt_url_map or conn paramters
922                         try:
923                                 # Process video information
924                                 self._downloader.process_info({
925                                         'id':           video_id.decode('utf-8'),
926                                         'url':          video_real_url.decode('utf-8'),
927                                         'uploader':     video_uploader.decode('utf-8'),
928                                         'title':        video_title,
929                                         'stitle':       simple_title,
930                                         'ext':          video_extension.decode('utf-8'),
931                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
932                                         'thumbnail':    video_thumbnail.decode('utf-8'),
933                                         'description':  video_description.decode('utf-8'),
934                                         'player_url':   player_url,
935                                 })
936                         except UnavailableVideoError, err:
937                                 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
938
939
940 class MetacafeIE(InfoExtractor):
941         """Information Extractor for metacafe.com."""
942
943         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
944         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
945         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
946         _youtube_ie = None
947
948         def __init__(self, youtube_ie, downloader=None):
949                 InfoExtractor.__init__(self, downloader)
950                 self._youtube_ie = youtube_ie
951
952         @staticmethod
953         def suitable(url):
954                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
955
956         def report_disclaimer(self):
957                 """Report disclaimer retrieval."""
958                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
959
960         def report_age_confirmation(self):
961                 """Report attempt to confirm age."""
962                 self._downloader.to_stdout(u'[metacafe] Confirming age')
963         
964         def report_download_webpage(self, video_id):
965                 """Report webpage download."""
966                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
967         
968         def report_extraction(self, video_id):
969                 """Report information extraction."""
970                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
971
972         def _real_initialize(self):
973                 # Retrieve disclaimer
974                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
975                 try:
976                         self.report_disclaimer()
977                         disclaimer = urllib2.urlopen(request).read()
978                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
979                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
980                         return
981
982                 # Confirm age
983                 disclaimer_form = {
984                         'filters': '0',
985                         'submit': "Continue - I'm over 18",
986                         }
987                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
988                 try:
989                         self.report_age_confirmation()
990                         disclaimer = urllib2.urlopen(request).read()
991                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
992                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
993                         return
994         
995         def _real_extract(self, url):
996                 # Extract id and simplified title from URL
997                 mobj = re.match(self._VALID_URL, url)
998                 if mobj is None:
999                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1000                         return
1001
1002                 video_id = mobj.group(1)
1003
1004                 # Check if video comes from YouTube
1005                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1006                 if mobj2 is not None:
1007                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1008                         return
1009
1010                 # At this point we have a new video
1011                 self._downloader.increment_downloads()
1012
1013                 simple_title = mobj.group(2).decode('utf-8')
1014                 video_extension = 'flv'
1015
1016                 # Retrieve video webpage to extract further information
1017                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1018                 try:
1019                         self.report_download_webpage(video_id)
1020                         webpage = urllib2.urlopen(request).read()
1021                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1022                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1023                         return
1024
1025                 # Extract URL, uploader and title from webpage
1026                 self.report_extraction(video_id)
1027                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1028                 if mobj is None:
1029                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1030                         return
1031                 mediaURL = urllib.unquote(mobj.group(1))
1032
1033                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1034                 #if mobj is None:
1035                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1036                 #       return
1037                 #gdaKey = mobj.group(1)
1038                 #
1039                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1040
1041                 video_url = mediaURL
1042
1043                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1044                 if mobj is None:
1045                         self._downloader.trouble(u'ERROR: unable to extract title')
1046                         return
1047                 video_title = mobj.group(1).decode('utf-8')
1048                 video_title = sanitize_title(video_title)
1049
1050                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1051                 if mobj is None:
1052                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1053                         return
1054                 video_uploader = mobj.group(1)
1055
1056                 try:
1057                         # Process video information
1058                         self._downloader.process_info({
1059                                 'id':           video_id.decode('utf-8'),
1060                                 'url':          video_url.decode('utf-8'),
1061                                 'uploader':     video_uploader.decode('utf-8'),
1062                                 'title':        video_title,
1063                                 'stitle':       simple_title,
1064                                 'ext':          video_extension.decode('utf-8'),
1065                                 'format':       u'NA',
1066                                 'player_url':   None,
1067                         })
1068                 except UnavailableVideoError:
1069                         self._downloader.trouble(u'ERROR: unable to download video')
1070
1071
1072 class DailymotionIE(InfoExtractor):
1073         """Information Extractor for Dailymotion"""
1074
1075         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1076
1077         def __init__(self, downloader=None):
1078                 InfoExtractor.__init__(self, downloader)
1079
1080         @staticmethod
1081         def suitable(url):
1082                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1083
1084         def report_download_webpage(self, video_id):
1085                 """Report webpage download."""
1086                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1087         
1088         def report_extraction(self, video_id):
1089                 """Report information extraction."""
1090                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1091
1092         def _real_initialize(self):
1093                 return
1094
1095         def _real_extract(self, url):
1096                 # Extract id and simplified title from URL
1097                 mobj = re.match(self._VALID_URL, url)
1098                 if mobj is None:
1099                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1100                         return
1101
1102                 # At this point we have a new video
1103                 self._downloader.increment_downloads()
1104                 video_id = mobj.group(1)
1105
1106                 simple_title = mobj.group(2).decode('utf-8')
1107                 video_extension = 'flv'
1108
1109                 # Retrieve video webpage to extract further information
1110                 request = urllib2.Request(url)
1111                 try:
1112                         self.report_download_webpage(video_id)
1113                         webpage = urllib2.urlopen(request).read()
1114                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1115                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1116                         return
1117
1118                 # Extract URL, uploader and title from webpage
1119                 self.report_extraction(video_id)
1120                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1121                 if mobj is None:
1122                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1123                         return
1124                 mediaURL = urllib.unquote(mobj.group(1))
1125
1126                 # if needed add http://www.dailymotion.com/ if relative URL
1127
1128                 video_url = mediaURL
1129
1130                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1131                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1132                 if mobj is None:
1133                         self._downloader.trouble(u'ERROR: unable to extract title')
1134                         return
1135                 video_title = mobj.group(1).decode('utf-8')
1136                 video_title = sanitize_title(video_title)
1137
1138                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1139                 if mobj is None:
1140                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1141                         return
1142                 video_uploader = mobj.group(1)
1143
1144                 try:
1145                         # Process video information
1146                         self._downloader.process_info({
1147                                 'id':           video_id.decode('utf-8'),
1148                                 'url':          video_url.decode('utf-8'),
1149                                 'uploader':     video_uploader.decode('utf-8'),
1150                                 'title':        video_title,
1151                                 'stitle':       simple_title,
1152                                 'ext':          video_extension.decode('utf-8'),
1153                                 'format':       u'NA',
1154                                 'player_url':   None,
1155                         })
1156                 except UnavailableVideoError:
1157                         self._downloader.trouble(u'ERROR: unable to download video')
1158
1159 class GoogleIE(InfoExtractor):
1160         """Information extractor for video.google.com."""
1161
1162         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1163
1164         def __init__(self, downloader=None):
1165                 InfoExtractor.__init__(self, downloader)
1166
1167         @staticmethod
1168         def suitable(url):
1169                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1170
1171         def report_download_webpage(self, video_id):
1172                 """Report webpage download."""
1173                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1174
1175         def report_extraction(self, video_id):
1176                 """Report information extraction."""
1177                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1178
1179         def _real_initialize(self):
1180                 return
1181
1182         def _real_extract(self, url):
1183                 # Extract id from URL
1184                 mobj = re.match(self._VALID_URL, url)
1185                 if mobj is None:
1186                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1187                         return
1188
1189                 # At this point we have a new video
1190                 self._downloader.increment_downloads()
1191                 video_id = mobj.group(1)
1192
1193                 video_extension = 'mp4'
1194
1195                 # Retrieve video webpage to extract further information
1196                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1197                 try:
1198                         self.report_download_webpage(video_id)
1199                         webpage = urllib2.urlopen(request).read()
1200                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1201                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1202                         return
1203
1204                 # Extract URL, uploader, and title from webpage
1205                 self.report_extraction(video_id)
1206                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1207                 if mobj is None:
1208                         video_extension = 'flv'
1209                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1210                 if mobj is None:
1211                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1212                         return
1213                 mediaURL = urllib.unquote(mobj.group(1))
1214                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1215                 mediaURL = mediaURL.replace('\\x26', '\x26')
1216
1217                 video_url = mediaURL
1218
1219                 mobj = re.search(r'<title>(.*)</title>', webpage)
1220                 if mobj is None:
1221                         self._downloader.trouble(u'ERROR: unable to extract title')
1222                         return
1223                 video_title = mobj.group(1).decode('utf-8')
1224                 video_title = sanitize_title(video_title)
1225                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1226
1227                 # Extract video description
1228                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1229                 if mobj is None:
1230                         self._downloader.trouble(u'ERROR: unable to extract video description')
1231                         return
1232                 video_description = mobj.group(1).decode('utf-8')
1233                 if not video_description:
1234                         video_description = 'No description available.'
1235
1236                 # Extract video thumbnail
1237                 if self._downloader.params.get('forcethumbnail', False):
1238                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1239                         try:
1240                                 webpage = urllib2.urlopen(request).read()
1241                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1242                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1243                                 return
1244                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1245                         if mobj is None:
1246                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1247                                 return
1248                         video_thumbnail = mobj.group(1)
1249                 else:   # we need something to pass to process_info
1250                         video_thumbnail = ''
1251
1252
1253                 try:
1254                         # Process video information
1255                         self._downloader.process_info({
1256                                 'id':           video_id.decode('utf-8'),
1257                                 'url':          video_url.decode('utf-8'),
1258                                 'uploader':     u'NA',
1259                                 'title':        video_title,
1260                                 'stitle':       simple_title,
1261                                 'ext':          video_extension.decode('utf-8'),
1262                                 'format':       u'NA',
1263                                 'player_url':   None,
1264                         })
1265                 except UnavailableVideoError:
1266                         self._downloader.trouble(u'ERROR: unable to download video')
1267
1268
1269 class PhotobucketIE(InfoExtractor):
1270         """Information extractor for photobucket.com."""
1271
1272         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1273
1274         def __init__(self, downloader=None):
1275                 InfoExtractor.__init__(self, downloader)
1276
1277         @staticmethod
1278         def suitable(url):
1279                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1280
1281         def report_download_webpage(self, video_id):
1282                 """Report webpage download."""
1283                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1284
1285         def report_extraction(self, video_id):
1286                 """Report information extraction."""
1287                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1288
1289         def _real_initialize(self):
1290                 return
1291
1292         def _real_extract(self, url):
1293                 # Extract id from URL
1294                 mobj = re.match(self._VALID_URL, url)
1295                 if mobj is None:
1296                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1297                         return
1298
1299                 # At this point we have a new video
1300                 self._downloader.increment_downloads()
1301                 video_id = mobj.group(1)
1302
1303                 video_extension = 'flv'
1304
1305                 # Retrieve video webpage to extract further information
1306                 request = urllib2.Request(url)
1307                 try:
1308                         self.report_download_webpage(video_id)
1309                         webpage = urllib2.urlopen(request).read()
1310                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1311                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1312                         return
1313
1314                 # Extract URL, uploader, and title from webpage
1315                 self.report_extraction(video_id)
1316                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1317                 if mobj is None:
1318                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1319                         return
1320                 mediaURL = urllib.unquote(mobj.group(1))
1321
1322                 video_url = mediaURL
1323
1324                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1325                 if mobj is None:
1326                         self._downloader.trouble(u'ERROR: unable to extract title')
1327                         return
1328                 video_title = mobj.group(1).decode('utf-8')
1329                 video_title = sanitize_title(video_title)
1330                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1331
1332                 video_uploader = mobj.group(2).decode('utf-8')
1333
1334                 try:
1335                         # Process video information
1336                         self._downloader.process_info({
1337                                 'id':           video_id.decode('utf-8'),
1338                                 'url':          video_url.decode('utf-8'),
1339                                 'uploader':     video_uploader,
1340                                 'title':        video_title,
1341                                 'stitle':       simple_title,
1342                                 'ext':          video_extension.decode('utf-8'),
1343                                 'format':       u'NA',
1344                                 'player_url':   None,
1345                         })
1346                 except UnavailableVideoError:
1347                         self._downloader.trouble(u'ERROR: unable to download video')
1348
1349
1350 class YahooIE(InfoExtractor):
1351         """Information extractor for video.yahoo.com."""
1352
1353         # _VALID_URL matches all Yahoo! Video URLs
1354         # _VPAGE_URL matches only the extractable '/watch/' URLs
1355         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1356         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1357
1358         def __init__(self, downloader=None):
1359                 InfoExtractor.__init__(self, downloader)
1360
1361         @staticmethod
1362         def suitable(url):
1363                 return (re.match(YahooIE._VALID_URL, url) is not None)
1364
1365         def report_download_webpage(self, video_id):
1366                 """Report webpage download."""
1367                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1368
1369         def report_extraction(self, video_id):
1370                 """Report information extraction."""
1371                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1372
1373         def _real_initialize(self):
1374                 return
1375
1376         def _real_extract(self, url, new_video=True):
1377                 # Extract ID from URL
1378                 mobj = re.match(self._VALID_URL, url)
1379                 if mobj is None:
1380                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1381                         return
1382
1383                 # At this point we have a new video
1384                 self._downloader.increment_downloads()
1385                 video_id = mobj.group(2)
1386                 video_extension = 'flv'
1387
1388                 # Rewrite valid but non-extractable URLs as
1389                 # extractable English language /watch/ URLs
1390                 if re.match(self._VPAGE_URL, url) is None:
1391                         request = urllib2.Request(url)
1392                         try:
1393                                 webpage = urllib2.urlopen(request).read()
1394                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1395                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1396                                 return
1397
1398                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1399                         if mobj is None:
1400                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1401                                 return
1402                         yahoo_id = mobj.group(1)
1403
1404                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1405                         if mobj is None:
1406                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1407                                 return
1408                         yahoo_vid = mobj.group(1)
1409
1410                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1411                         return self._real_extract(url, new_video=False)
1412
1413                 # Retrieve video webpage to extract further information
1414                 request = urllib2.Request(url)
1415                 try:
1416                         self.report_download_webpage(video_id)
1417                         webpage = urllib2.urlopen(request).read()
1418                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1419                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1420                         return
1421
1422                 # Extract uploader and title from webpage
1423                 self.report_extraction(video_id)
1424                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1425                 if mobj is None:
1426                         self._downloader.trouble(u'ERROR: unable to extract video title')
1427                         return
1428                 video_title = mobj.group(1).decode('utf-8')
1429                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1430
1431                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1432                 if mobj is None:
1433                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1434                         return
1435                 video_uploader = mobj.group(1).decode('utf-8')
1436
1437                 # Extract video thumbnail
1438                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1439                 if mobj is None:
1440                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1441                         return
1442                 video_thumbnail = mobj.group(1).decode('utf-8')
1443
1444                 # Extract video description
1445                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1446                 if mobj is None:
1447                         self._downloader.trouble(u'ERROR: unable to extract video description')
1448                         return
1449                 video_description = mobj.group(1).decode('utf-8')
1450                 if not video_description: video_description = 'No description available.'
1451
1452                 # Extract video height and width
1453                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1454                 if mobj is None:
1455                         self._downloader.trouble(u'ERROR: unable to extract video height')
1456                         return
1457                 yv_video_height = mobj.group(1)
1458
1459                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1460                 if mobj is None:
1461                         self._downloader.trouble(u'ERROR: unable to extract video width')
1462                         return
1463                 yv_video_width = mobj.group(1)
1464
1465                 # Retrieve video playlist to extract media URL
1466                 # I'm not completely sure what all these options are, but we
1467                 # seem to need most of them, otherwise the server sends a 401.
1468                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1469                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1470                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1471                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1472                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1473                 try:
1474                         self.report_download_webpage(video_id)
1475                         webpage = urllib2.urlopen(request).read()
1476                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1477                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1478                         return
1479
1480                 # Extract media URL from playlist XML
1481                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1482                 if mobj is None:
1483                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1484                         return
1485                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1486                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1487
1488                 try:
1489                         # Process video information
1490                         self._downloader.process_info({
1491                                 'id':           video_id.decode('utf-8'),
1492                                 'url':          video_url,
1493                                 'uploader':     video_uploader,
1494                                 'title':        video_title,
1495                                 'stitle':       simple_title,
1496                                 'ext':          video_extension.decode('utf-8'),
1497                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1498                                 'description':  video_description,
1499                                 'thumbnail':    video_thumbnail,
1500                                 'description':  video_description,
1501                                 'player_url':   None,
1502                         })
1503                 except UnavailableVideoError:
1504                         self._downloader.trouble(u'ERROR: unable to download video')
1505
1506
1507 class GenericIE(InfoExtractor):
1508         """Generic last-resort information extractor."""
1509
1510         def __init__(self, downloader=None):
1511                 InfoExtractor.__init__(self, downloader)
1512
1513         @staticmethod
1514         def suitable(url):
1515                 return True
1516
1517         def report_download_webpage(self, video_id):
1518                 """Report webpage download."""
1519                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1520                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1521
1522         def report_extraction(self, video_id):
1523                 """Report information extraction."""
1524                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1525
1526         def _real_initialize(self):
1527                 return
1528
1529         def _real_extract(self, url):
1530                 # At this point we have a new video
1531                 self._downloader.increment_downloads()
1532
1533                 video_id = url.split('/')[-1]
1534                 request = urllib2.Request(url)
1535                 try:
1536                         self.report_download_webpage(video_id)
1537                         webpage = urllib2.urlopen(request).read()
1538                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1539                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1540                         return
1541                 except ValueError, err:
1542                         # since this is the last-resort InfoExtractor, if
1543                         # this error is thrown, it'll be thrown here
1544                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1545                         return
1546
1547                 # Start with something easy: JW Player in SWFObject
1548                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1549                 if mobj is None:
1550                         # Broaden the search a little bit
1551                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1552                 if mobj is None:
1553                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1554                         return
1555
1556                 # It's possible that one of the regexes
1557                 # matched, but returned an empty group:
1558                 if mobj.group(1) is None:
1559                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1560                         return
1561
1562                 video_url = urllib.unquote(mobj.group(1))
1563                 video_id  = os.path.basename(video_url)
1564
1565                 # here's a fun little line of code for you:
1566                 video_extension = os.path.splitext(video_id)[1][1:]
1567                 video_id        = os.path.splitext(video_id)[0]
1568
1569                 # it's tempting to parse this further, but you would
1570                 # have to take into account all the variations like
1571                 #   Video Title - Site Name
1572                 #   Site Name | Video Title
1573                 #   Video Title - Tagline | Site Name
1574                 # and so on and so forth; it's just not practical
1575                 mobj = re.search(r'<title>(.*)</title>', webpage)
1576                 if mobj is None:
1577                         self._downloader.trouble(u'ERROR: unable to extract title')
1578                         return
1579                 video_title = mobj.group(1).decode('utf-8')
1580                 video_title = sanitize_title(video_title)
1581                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1582
1583                 # video uploader is domain name
1584                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1585                 if mobj is None:
1586                         self._downloader.trouble(u'ERROR: unable to extract title')
1587                         return
1588                 video_uploader = mobj.group(1).decode('utf-8')
1589
1590                 try:
1591                         # Process video information
1592                         self._downloader.process_info({
1593                                 'id':           video_id.decode('utf-8'),
1594                                 'url':          video_url.decode('utf-8'),
1595                                 'uploader':     video_uploader,
1596                                 'title':        video_title,
1597                                 'stitle':       simple_title,
1598                                 'ext':          video_extension.decode('utf-8'),
1599                                 'format':       u'NA',
1600                                 'player_url':   None,
1601                         })
1602                 except UnavailableVideoError, err:
1603                         self._downloader.trouble(u'ERROR: unable to download video')
1604
1605
1606 class YoutubeSearchIE(InfoExtractor):
1607         """Information Extractor for YouTube search queries."""
1608         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1609         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1610         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1611         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1612         _youtube_ie = None
1613         _max_youtube_results = 1000
1614
1615         def __init__(self, youtube_ie, downloader=None):
1616                 InfoExtractor.__init__(self, downloader)
1617                 self._youtube_ie = youtube_ie
1618         
1619         @staticmethod
1620         def suitable(url):
1621                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1622
1623         def report_download_page(self, query, pagenum):
1624                 """Report attempt to download playlist page with given number."""
1625                 query = query.decode(preferredencoding())
1626                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1627
1628         def _real_initialize(self):
1629                 self._youtube_ie.initialize()
1630         
1631         def _real_extract(self, query):
1632                 mobj = re.match(self._VALID_QUERY, query)
1633                 if mobj is None:
1634                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1635                         return
1636
1637                 prefix, query = query.split(':')
1638                 prefix = prefix[8:]
1639                 query  = query.encode('utf-8')
1640                 if prefix == '':
1641                         self._download_n_results(query, 1)
1642                         return
1643                 elif prefix == 'all':
1644                         self._download_n_results(query, self._max_youtube_results)
1645                         return
1646                 else:
1647                         try:
1648                                 n = long(prefix)
1649                                 if n <= 0:
1650                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1651                                         return
1652                                 elif n > self._max_youtube_results:
1653                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1654                                         n = self._max_youtube_results
1655                                 self._download_n_results(query, n)
1656                                 return
1657                         except ValueError: # parsing prefix as integer fails
1658                                 self._download_n_results(query, 1)
1659                                 return
1660
1661         def _download_n_results(self, query, n):
1662                 """Downloads a specified number of results for a query"""
1663
1664                 video_ids = []
1665                 already_seen = set()
1666                 pagenum = 1
1667
1668                 while True:
1669                         self.report_download_page(query, pagenum)
1670                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1671                         request = urllib2.Request(result_url, None, std_headers)
1672                         try:
1673                                 page = urllib2.urlopen(request).read()
1674                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1675                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1676                                 return
1677
1678                         # Extract video identifiers
1679                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1680                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1681                                 if video_id not in already_seen:
1682                                         video_ids.append(video_id)
1683                                         already_seen.add(video_id)
1684                                         if len(video_ids) == n:
1685                                                 # Specified n videos reached
1686                                                 for id in video_ids:
1687                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1688                                                 return
1689
1690                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1691                                 for id in video_ids:
1692                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1693                                 return
1694
1695                         pagenum = pagenum + 1
1696
1697 class GoogleSearchIE(InfoExtractor):
1698         """Information Extractor for Google Video search queries."""
1699         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1700         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1701         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1702         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1703         _google_ie = None
1704         _max_google_results = 1000
1705
1706         def __init__(self, google_ie, downloader=None):
1707                 InfoExtractor.__init__(self, downloader)
1708                 self._google_ie = google_ie
1709         
1710         @staticmethod
1711         def suitable(url):
1712                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1713
1714         def report_download_page(self, query, pagenum):
1715                 """Report attempt to download playlist page with given number."""
1716                 query = query.decode(preferredencoding())
1717                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1718
1719         def _real_initialize(self):
1720                 self._google_ie.initialize()
1721         
1722         def _real_extract(self, query):
1723                 mobj = re.match(self._VALID_QUERY, query)
1724                 if mobj is None:
1725                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1726                         return
1727
1728                 prefix, query = query.split(':')
1729                 prefix = prefix[8:]
1730                 query  = query.encode('utf-8')
1731                 if prefix == '':
1732                         self._download_n_results(query, 1)
1733                         return
1734                 elif prefix == 'all':
1735                         self._download_n_results(query, self._max_google_results)
1736                         return
1737                 else:
1738                         try:
1739                                 n = long(prefix)
1740                                 if n <= 0:
1741                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1742                                         return
1743                                 elif n > self._max_google_results:
1744                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1745                                         n = self._max_google_results
1746                                 self._download_n_results(query, n)
1747                                 return
1748                         except ValueError: # parsing prefix as integer fails
1749                                 self._download_n_results(query, 1)
1750                                 return
1751
1752         def _download_n_results(self, query, n):
1753                 """Downloads a specified number of results for a query"""
1754
1755                 video_ids = []
1756                 already_seen = set()
1757                 pagenum = 1
1758
1759                 while True:
1760                         self.report_download_page(query, pagenum)
1761                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1762                         request = urllib2.Request(result_url, None, std_headers)
1763                         try:
1764                                 page = urllib2.urlopen(request).read()
1765                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1766                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1767                                 return
1768
1769                         # Extract video identifiers
1770                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1771                                 video_id = mobj.group(1)
1772                                 if video_id not in already_seen:
1773                                         video_ids.append(video_id)
1774                                         already_seen.add(video_id)
1775                                         if len(video_ids) == n:
1776                                                 # Specified n videos reached
1777                                                 for id in video_ids:
1778                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1779                                                 return
1780
1781                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1782                                 for id in video_ids:
1783                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1784                                 return
1785
1786                         pagenum = pagenum + 1
1787
1788 class YahooSearchIE(InfoExtractor):
1789         """Information Extractor for Yahoo! Video search queries."""
1790         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1791         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1792         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1793         _MORE_PAGES_INDICATOR = r'\s*Next'
1794         _yahoo_ie = None
1795         _max_yahoo_results = 1000
1796
1797         def __init__(self, yahoo_ie, downloader=None):
1798                 InfoExtractor.__init__(self, downloader)
1799                 self._yahoo_ie = yahoo_ie
1800         
1801         @staticmethod
1802         def suitable(url):
1803                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1804
1805         def report_download_page(self, query, pagenum):
1806                 """Report attempt to download playlist page with given number."""
1807                 query = query.decode(preferredencoding())
1808                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1809
	def _real_initialize(self):
		# Initialization is delegated to the wrapped YahooIE instance.
		self._yahoo_ie.initialize()
1812         
1813         def _real_extract(self, query):
1814                 mobj = re.match(self._VALID_QUERY, query)
1815                 if mobj is None:
1816                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1817                         return
1818
1819                 prefix, query = query.split(':')
1820                 prefix = prefix[8:]
1821                 query  = query.encode('utf-8')
1822                 if prefix == '':
1823                         self._download_n_results(query, 1)
1824                         return
1825                 elif prefix == 'all':
1826                         self._download_n_results(query, self._max_yahoo_results)
1827                         return
1828                 else:
1829                         try:
1830                                 n = long(prefix)
1831                                 if n <= 0:
1832                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1833                                         return
1834                                 elif n > self._max_yahoo_results:
1835                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1836                                         n = self._max_yahoo_results
1837                                 self._download_n_results(query, n)
1838                                 return
1839                         except ValueError: # parsing prefix as integer fails
1840                                 self._download_n_results(query, 1)
1841                                 return
1842
	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query.

		Pages through the Yahoo Video search results, collecting unique
		video ids in first-seen order, and hands each one to the wrapped
		YahooIE as soon as either n ids were found or the last results
		page was reached.
		"""

		video_ids = []
		already_seen = set()  # ids collected so far, for O(1) dedup
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				video_id = mobj.group(1)
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
						return

			# No "Next" link: this was the last page, extract what we have.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
				return

			pagenum = pagenum + 1
1878
1879 class YoutubePlaylistIE(InfoExtractor):
1880         """Information Extractor for YouTube playlists."""
1881
1882         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1883         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1884         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1885         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1886         _youtube_ie = None
1887
1888         def __init__(self, youtube_ie, downloader=None):
1889                 InfoExtractor.__init__(self, downloader)
1890                 self._youtube_ie = youtube_ie
1891         
1892         @staticmethod
1893         def suitable(url):
1894                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1895
1896         def report_download_page(self, playlist_id, pagenum):
1897                 """Report attempt to download playlist page with given number."""
1898                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1899
1900         def _real_initialize(self):
1901                 self._youtube_ie.initialize()
1902         
1903         def _real_extract(self, url):
1904                 # Extract playlist id
1905                 mobj = re.match(self._VALID_URL, url)
1906                 if mobj is None:
1907                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1908                         return
1909
1910                 # Download playlist pages
1911                 playlist_id = mobj.group(1)
1912                 video_ids = []
1913                 pagenum = 1
1914
1915                 while True:
1916                         self.report_download_page(playlist_id, pagenum)
1917                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1918                         try:
1919                                 page = urllib2.urlopen(request).read()
1920                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1921                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1922                                 return
1923
1924                         # Extract video identifiers
1925                         ids_in_page = []
1926                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1927                                 if mobj.group(1) not in ids_in_page:
1928                                         ids_in_page.append(mobj.group(1))
1929                         video_ids.extend(ids_in_page)
1930
1931                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1932                                 break
1933                         pagenum = pagenum + 1
1934
1935                 for id in video_ids:
1936                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1937                 return
1938
1939 class YoutubeUserIE(InfoExtractor):
1940         """Information Extractor for YouTube users."""
1941
1942         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1943         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1944         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1945         _youtube_ie = None
1946
1947         def __init__(self, youtube_ie, downloader=None):
1948                 InfoExtractor.__init__(self, downloader)
1949                 self._youtube_ie = youtube_ie
1950         
1951         @staticmethod
1952         def suitable(url):
1953                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1954
1955         def report_download_page(self, username):
1956                 """Report attempt to download user page."""
1957                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1958
1959         def _real_initialize(self):
1960                 self._youtube_ie.initialize()
1961         
1962         def _real_extract(self, url):
1963                 # Extract username
1964                 mobj = re.match(self._VALID_URL, url)
1965                 if mobj is None:
1966                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1967                         return
1968
1969                 # Download user page
1970                 username = mobj.group(1)
1971                 video_ids = []
1972                 pagenum = 1
1973
1974                 self.report_download_page(username)
1975                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1976                 try:
1977                         page = urllib2.urlopen(request).read()
1978                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1979                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1980                         return
1981
1982                 # Extract video identifiers
1983                 ids_in_page = []
1984
1985                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1986                         if mobj.group(1) not in ids_in_page:
1987                                 ids_in_page.append(mobj.group(1))
1988                 video_ids.extend(ids_in_page)
1989
1990                 for id in video_ids:
1991                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1992                 return
1993
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader through its
	add_post_processor() method. After a successful download the
	downloader invokes run() on each registered processor in turn,
	feeding the first one an initial dictionary and every later one the
	value returned by its predecessor. The chain ends as soon as a
	processor returns None, or when no processors remain.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this PP is attached to (see __init__/set_downloader).
	_downloader = None

	def __init__(self, downloader=None):
		"""Create the post processor, optionally bound to a downloader."""
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary with one
		extra key, "filepath", naming the downloaded file.

		Returning None aborts the remainder of the chain; returning a
		(possibly modified) dictionary passes it on to the next
		processor in line. Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		return information # by default, do nothing
2039         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite filename (this script) with the latest released version."""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the first;
		# build_opener() adds a ProxyHandler by default so proxies still work,
		# but the first call looks redundant -- confirm before removing.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.07.22',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop lines that are empty after stripping whitespace.
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Only the username was given: prompt interactively for the password.
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			# Any of the "print and exit" options implies quiet and simulate.
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Chained 'or': the first applicable template wins; the option
			# checks above already made -o, -t and -l mutually exclusive.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# Register the more specific extractors before the plain YoutubeIE
		# (presumably suitability is checked in registration order -- verify).
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')