cf0336e56c7bfc377eefcbcb339ca788801243a0
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP request headers attached to every request; mimics a contemporary
# desktop Firefox so sites serve their normal HTML pages.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Firefox/3.6.7',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters considered safe for "simplified" titles: ASCII letters and
# digits, as unicode objects (str.decode yields unicode in Python 2).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	reported encoding cannot actually encode text, fall back to UTF-8.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Probe the codec; a bogus locale makes encode() raise.
		u'TEST'.encode(pref)
	except Exception:
		# Was a bare "except:", which also swallowed KeyboardInterrupt
		# and SystemExit; Exception keeps the best-effort fallback
		# without hiding those. The original also wrapped this in a
		# one-shot generator for no benefit; return directly instead.
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects when they are not configured to
	continue on errors; carries the appropriate error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that several
	downloads would end up overwriting the same file on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to signal that the
	postprocessing task failed.
	"""
	pass
133
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that the site does not
	offer for that video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when fewer bytes arrive than the
	server originally announced, which usually means the connection was
	interrupted.
	"""
	# Both counters are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded, self.expected = downloaded, expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         format_limit:   Highest quality format to try.
193         outtmpl:        Template for output names.
194         ignoreerrors:   Do not stop on download errors.
195         ratelimit:      Download speed limit, in bytes/sec.
196         nooverwrites:   Prevent overwriting files.
197         retries:        Number of times to retry for HTTP error 503
198         continuedl:     Try to continue downloads if possible.
199         noprogress:     Do not print the progress bar.
200         """
201
202         params = None
203         _ies = []
204         _pps = []
205         _download_retcode = None
206         _num_downloads = None
207
208         def __init__(self, params):
209                 """Create a FileDownloader object with the given options."""
210                 self._ies = []
211                 self._pps = []
212                 self._download_retcode = 0
213                 self._num_downloads = 0
214                 self.params = params
215         
216         @staticmethod
217         def pmkdir(filename):
218                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219                 components = filename.split(os.sep)
220                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
221                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
222                 for dir in aggregate:
223                         if not os.path.exists(dir):
224                                 os.mkdir(dir)
225         
226         @staticmethod
227         def format_bytes(bytes):
228                 if bytes is None:
229                         return 'N/A'
230                 if type(bytes) is str:
231                         bytes = float(bytes)
232                 if bytes == 0.0:
233                         exponent = 0
234                 else:
235                         exponent = long(math.log(bytes, 1024.0))
236                 suffix = 'bkMGTPEZY'[exponent]
237                 converted = float(bytes) / float(1024**exponent)
238                 return '%.2f%s' % (converted, suffix)
239
240         @staticmethod
241         def calc_percent(byte_counter, data_len):
242                 if data_len is None:
243                         return '---.-%'
244                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245
246         @staticmethod
247         def calc_eta(start, now, total, current):
248                 if total is None:
249                         return '--:--'
250                 dif = now - start
251                 if current == 0 or dif < 0.001: # One millisecond
252                         return '--:--'
253                 rate = float(current) / dif
254                 eta = long((float(total) - float(current)) / rate)
255                 (eta_mins, eta_secs) = divmod(eta, 60)
256                 if eta_mins > 99:
257                         return '--:--'
258                 return '%02d:%02d' % (eta_mins, eta_secs)
259
260         @staticmethod
261         def calc_speed(start, now, bytes):
262                 dif = now - start
263                 if bytes == 0 or dif < 0.001: # One millisecond
264                         return '%10s' % '---b/s'
265                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266
267         @staticmethod
268         def best_block_size(elapsed_time, bytes):
269                 new_min = max(bytes / 2.0, 1.0)
270                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271                 if elapsed_time < 0.001:
272                         return long(new_max)
273                 rate = bytes / elapsed_time
274                 if rate > new_max:
275                         return long(new_max)
276                 if rate < new_min:
277                         return long(new_min)
278                 return long(rate)
279
280         @staticmethod
281         def parse_bytes(bytestr):
282                 """Parse a string indicating a byte quantity into a long integer."""
283                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284                 if matchobj is None:
285                         return None
286                 number = float(matchobj.group(1))
287                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
288                 return long(round(number * multiplier))
289
290         def add_info_extractor(self, ie):
291                 """Add an InfoExtractor object to the end of the list."""
292                 self._ies.append(ie)
293                 ie.set_downloader(self)
294         
295         def add_post_processor(self, pp):
296                 """Add a PostProcessor object to the end of the chain."""
297                 self._pps.append(pp)
298                 pp.set_downloader(self)
299         
300         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
301                 """Print message to stdout if not in quiet mode."""
302                 try:
303                         if not self.params.get('quiet', False):
304                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
305                         sys.stdout.flush()
306                 except (UnicodeEncodeError), err:
307                         if not ignore_encoding_errors:
308                                 raise
309         
310         def to_stderr(self, message):
311                 """Print message to stderr."""
312                 print >>sys.stderr, message.encode(preferredencoding())
313         
314         def fixed_template(self):
315                 """Checks if the output template is fixed."""
316                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
317
318         def trouble(self, message=None):
319                 """Determine action to take when a download problem appears.
320
321                 Depending on if the downloader has been configured to ignore
322                 download errors or not, this method may throw an exception or
323                 not when errors are found, after printing the message.
324                 """
325                 if message is not None:
326                         self.to_stderr(message)
327                 if not self.params.get('ignoreerrors', False):
328                         raise DownloadError(message)
329                 self._download_retcode = 1
330
331         def slow_down(self, start_time, byte_counter):
332                 """Sleep if the download speed is over the rate limit."""
333                 rate_limit = self.params.get('ratelimit', None)
334                 if rate_limit is None or byte_counter == 0:
335                         return
336                 now = time.time()
337                 elapsed = now - start_time
338                 if elapsed <= 0.0:
339                         return
340                 speed = float(byte_counter) / elapsed
341                 if speed > rate_limit:
342                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
343
344         def report_destination(self, filename):
345                 """Report destination filename."""
346                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
347         
348         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
349                 """Report download progress."""
350                 if self.params.get('noprogress', False):
351                         return
352                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
353                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
354
355         def report_resuming_byte(self, resume_len):
356                 """Report attemtp to resume at given byte."""
357                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
358         
359         def report_retry(self, count, retries):
360                 """Report retry in case of HTTP error 503"""
361                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
362         
363         def report_file_already_downloaded(self, file_name):
364                 """Report file has already been fully downloaded."""
365                 try:
366                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
367                 except (UnicodeEncodeError), err:
368                         self.to_stdout(u'[download] The file has already been downloaded')
369         
370         def report_unable_to_resume(self):
371                 """Report it was impossible to resume download."""
372                 self.to_stdout(u'[download] Unable to resume')
373         
374         def report_finish(self):
375                 """Report download finished."""
376                 if self.params.get('noprogress', False):
377                         self.to_stdout(u'[download] Download completed')
378                 else:
379                         self.to_stdout(u'')
380         
381         def increment_downloads(self):
382                 """Increment the ordinal that assigns a number to each file."""
383                 self._num_downloads += 1
384
385         def process_info(self, info_dict):
386                 """Process a single dictionary returned by an InfoExtractor."""
387                 # Do nothing else if in simulate mode
388                 if self.params.get('simulate', False):
389                         # Forced printings
390                         if self.params.get('forcetitle', False):
391                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
392                         if self.params.get('forceurl', False):
393                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
394                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
395                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
396                         if self.params.get('forcedescription', False) and 'description' in info_dict:
397                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
398
399                         return
400                         
401                 try:
402                         template_dict = dict(info_dict)
403                         template_dict['epoch'] = unicode(long(time.time()))
404                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
405                         filename = self.params['outtmpl'] % template_dict
406                 except (ValueError, KeyError), err:
407                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
408                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
409                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
410                         return
411
412                 try:
413                         self.pmkdir(filename)
414                 except (OSError, IOError), err:
415                         self.trouble('ERROR: unable to create directories: %s' % str(err))
416                         return
417
418                 try:
419                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
420                 except (OSError, IOError), err:
421                         raise UnavailableVideoError
422                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
423                         self.trouble('ERROR: unable to download video data: %s' % str(err))
424                         return
425                 except (ContentTooShortError, ), err:
426                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
427                         return
428
429                 if success:
430                         try:
431                                 self.post_process(filename, info_dict)
432                         except (PostProcessingError), err:
433                                 self.trouble('ERROR: postprocessing: %s' % str(err))
434                                 return
435
436         def download(self, url_list):
437                 """Download a given list of URLs."""
438                 if len(url_list) > 1 and self.fixed_template():
439                         raise SameFileError(self.params['outtmpl'])
440
441                 for url in url_list:
442                         suitable_found = False
443                         for ie in self._ies:
444                                 # Go to next InfoExtractor if not suitable
445                                 if not ie.suitable(url):
446                                         continue
447
448                                 # Suitable InfoExtractor found
449                                 suitable_found = True
450
451                                 # Extract information from URL and process it
452                                 ie.extract(url)
453
454                                 # Suitable InfoExtractor had been found; go to next URL
455                                 break
456
457                         if not suitable_found:
458                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
459
460                 return self._download_retcode
461
462         def post_process(self, filename, ie_info):
463                 """Run the postprocessing chain on the given file."""
464                 info = dict(ie_info)
465                 info['filepath'] = filename
466                 for pp in self._pps:
467                         info = pp.run(info)
468                         if info is None:
469                                 break
470         
471         def _download_with_rtmpdump(self, filename, url, player_url):
472                 self.report_destination(filename)
473
474                 # Check for rtmpdump first
475                 try:
476                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
477                 except (OSError, IOError):
478                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
479                         return False
480
481                 # Download using rtmpdump. rtmpdump returns exit code 2 when
482                 # the connection was interrumpted and resuming appears to be
483                 # possible. This is part of rtmpdump's normal usage, AFAIK.
484                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
485                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
486                 while retval == 2 or retval == 1:
487                         prevsize = os.path.getsize(filename)
488                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
489                         time.sleep(5.0) # This seems to be needed
490                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
491                         cursize = os.path.getsize(filename)
492                         if prevsize == cursize and retval == 1:
493                                 break
494                 if retval == 0:
495                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
496                         return True
497                 else:
498                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
499                         return False
500
501         def _do_download(self, filename, url, player_url):
502                 # Attempt to download using rtmpdump
503                 if url.startswith('rtmp'):
504                         return self._download_with_rtmpdump(filename, url, player_url)
505
506                 stream = None
507                 open_mode = 'wb'
508                 basic_request = urllib2.Request(url, None, std_headers)
509                 request = urllib2.Request(url, None, std_headers)
510
511                 # Establish possible resume length
512                 if os.path.isfile(filename):
513                         resume_len = os.path.getsize(filename)
514                 else:
515                         resume_len = 0
516
517                 # Request parameters in case of being able to resume
518                 if self.params.get('continuedl', False) and resume_len != 0:
519                         self.report_resuming_byte(resume_len)
520                         request.add_header('Range','bytes=%d-' % resume_len)
521                         open_mode = 'ab'
522
523                 count = 0
524                 retries = self.params.get('retries', 0)
525                 while count <= retries:
526                         # Establish connection
527                         try:
528                                 data = urllib2.urlopen(request)
529                                 break
530                         except (urllib2.HTTPError, ), err:
531                                 if err.code != 503 and err.code != 416:
532                                         # Unexpected HTTP error
533                                         raise
534                                 elif err.code == 416:
535                                         # Unable to resume (requested range not satisfiable)
536                                         try:
537                                                 # Open the connection again without the range header
538                                                 data = urllib2.urlopen(basic_request)
539                                                 content_length = data.info()['Content-Length']
540                                         except (urllib2.HTTPError, ), err:
541                                                 if err.code != 503:
542                                                         raise
543                                         else:
544                                                 # Examine the reported length
545                                                 if content_length is not None and long(content_length) == resume_len:
546                                                         # The file had already been fully downloaded
547                                                         self.report_file_already_downloaded(filename)
548                                                         return True
549                                                 else:
550                                                         # The length does not match, we start the download over
551                                                         self.report_unable_to_resume()
552                                                         open_mode = 'wb'
553                                                         break
554                         # Retry
555                         count += 1
556                         if count <= retries:
557                                 self.report_retry(count, retries)
558
559                 if count > retries:
560                         self.trouble(u'ERROR: giving up after %s retries' % retries)
561                         return False
562
563                 data_len = data.info().get('Content-length', None)
564                 data_len_str = self.format_bytes(data_len)
565                 byte_counter = 0
566                 block_size = 1024
567                 start = time.time()
568                 while True:
569                         # Download and write
570                         before = time.time()
571                         data_block = data.read(block_size)
572                         after = time.time()
573                         data_block_len = len(data_block)
574                         if data_block_len == 0:
575                                 break
576                         byte_counter += data_block_len
577
578                         # Open file just in time
579                         if stream is None:
580                                 try:
581                                         (stream, filename) = sanitize_open(filename, open_mode)
582                                         self.report_destination(filename)
583                                 except (OSError, IOError), err:
584                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
585                                         return False
586                         try:
587                                 stream.write(data_block)
588                         except (IOError, OSError), err:
589                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
590                         block_size = self.best_block_size(after - before, data_block_len)
591
592                         # Progress message
593                         percent_str = self.calc_percent(byte_counter, data_len)
594                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
595                         speed_str = self.calc_speed(start, time.time(), byte_counter)
596                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
597
598                         # Apply rate limit
599                         self.slow_down(start, byte_counter)
600
601                 self.report_finish()
602                 if data_len is not None and str(byte_counter) != data_len:
603                         raise ContentTooShortError(byte_counter, long(data_len))
604                 return True
605
class InfoExtractor(object):
	"""Information Extractor class.

	Given a URL, an information extractor pulls out everything known
	about the video (or videos) it refers to: the real video URL, the
	title and simplified title, the uploader and more. The result is a
	dictionary handed to the FileDownloader, which may then download the
	video to the file system, among other outcomes. Each dictionary must
	include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3.  They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses should override _real_initialize() and _real_extract(),
	as well as the suitable() static method. Typically they are also
	instantiated and registered with the main downloader.
	"""

	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc). Runs at most once."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
676
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches youtu.be short links, /v/ embeds and watch pages; the video id
	# ends up in group 2 (group 1 is the optional site prefix).
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when the 'usenetrc' param is set
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps a format code to the downloaded file extension; formats not
	# listed here fall back to 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language and optionally log in and confirm age.

		Credentials come from the downloader params ('username'/'password')
		or, with 'usenetrc', from ~/.netrc. Language/login failures are soft
		(warnings to stderr); only age confirmation goes through trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language (so later page scrapes see the English site text)
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':         '/',
				'action_login': 'Log In',
				'username':     username,
				'password':     password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still in the response, the site rejected us
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':             '/',
				'action_confirm':       'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information for url and hand it to the downloader.

		Downloads the watch page and the get_video_info data, then builds
		one info dictionary per selected format and passes each one to
		self._downloader.process_info().
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (may legitimately be absent)
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info: try several 'el' variants until one yields a token
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse anything outside letters/digits to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description: only scraped when it will actually be printed
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# cap quality at format_limit by slicing the ordered list
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
940
941
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# Group 1 is the video id, group 2 the URL's simplified-title component
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used for videos that are actually hosted on YouTube
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Keeps the YoutubeIE to delegate 'yt-' videos to."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page and disable the filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age: POST filters=0 to switch the family filter off
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract video information for a metacafe.com watch URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; ids like 'yt-<id>' are
		# delegated to the YouTube extractor
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		#mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		#if mobj is None:
		#	self._downloader.trouble(u'ERROR: unable to extract gdaKey')
		#	return
		#gdaKey = mobj.group(1)
		#
		#video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1072
1073
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	# Group 1 is the video id, group 2 the URL's simplified-title component
	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No login or age gate needed for Dailymotion
		return

	def _real_extract(self, url):
		"""Extract video information for a Dailymotion video URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		# The URL component after '_' is used directly as the simplified title
		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1160
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	# Group 1 is the numeric docid
	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No login or age gate needed for Google Video
		return

	def _real_extract(self, url):
		"""Extract video information for a video.google.com URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage.
		# First try the mp4 download URL; fall back to the flash URL (flv).
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escapes ('=' and '&') left in the URL
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		# NOTE(review): video_description is extracted (and extraction failure
		# is fatal) but it is never passed to process_info() below.
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (only when it will actually be printed)
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1269
1270
1271 class PhotobucketIE(InfoExtractor):
1272         """Information extractor for photobucket.com."""
1273
1274         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1275
1276         def __init__(self, downloader=None):
1277                 InfoExtractor.__init__(self, downloader)
1278
1279         @staticmethod
1280         def suitable(url):
1281                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1282
1283         def report_download_webpage(self, video_id):
1284                 """Report webpage download."""
1285                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1286
1287         def report_extraction(self, video_id):
1288                 """Report information extraction."""
1289                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1290
1291         def _real_initialize(self):
1292                 return
1293
1294         def _real_extract(self, url):
1295                 # Extract id from URL
1296                 mobj = re.match(self._VALID_URL, url)
1297                 if mobj is None:
1298                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1299                         return
1300
1301                 # At this point we have a new video
1302                 self._downloader.increment_downloads()
1303                 video_id = mobj.group(1)
1304
1305                 video_extension = 'flv'
1306
1307                 # Retrieve video webpage to extract further information
1308                 request = urllib2.Request(url)
1309                 try:
1310                         self.report_download_webpage(video_id)
1311                         webpage = urllib2.urlopen(request).read()
1312                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1313                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1314                         return
1315
1316                 # Extract URL, uploader, and title from webpage
1317                 self.report_extraction(video_id)
1318                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1319                 if mobj is None:
1320                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1321                         return
1322                 mediaURL = urllib.unquote(mobj.group(1))
1323
1324                 video_url = mediaURL
1325
1326                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1327                 if mobj is None:
1328                         self._downloader.trouble(u'ERROR: unable to extract title')
1329                         return
1330                 video_title = mobj.group(1).decode('utf-8')
1331                 video_title = sanitize_title(video_title)
1332                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1333
1334                 video_uploader = mobj.group(2).decode('utf-8')
1335
1336                 try:
1337                         # Process video information
1338                         self._downloader.process_info({
1339                                 'id':           video_id.decode('utf-8'),
1340                                 'url':          video_url.decode('utf-8'),
1341                                 'uploader':     video_uploader,
1342                                 'title':        video_title,
1343                                 'stitle':       simple_title,
1344                                 'ext':          video_extension.decode('utf-8'),
1345                                 'format':       u'NA',
1346                                 'player_url':   None,
1347                         })
1348                 except UnavailableVideoError:
1349                         self._downloader.trouble(u'ERROR: unable to download video')
1350
1351
1352 class YahooIE(InfoExtractor):
1353         """Information extractor for video.yahoo.com."""
1354
1355         # _VALID_URL matches all Yahoo! Video URLs
1356         # _VPAGE_URL matches only the extractable '/watch/' URLs
1357         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1358         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1359
1360         def __init__(self, downloader=None):
1361                 InfoExtractor.__init__(self, downloader)
1362
1363         @staticmethod
1364         def suitable(url):
1365                 return (re.match(YahooIE._VALID_URL, url) is not None)
1366
1367         def report_download_webpage(self, video_id):
1368                 """Report webpage download."""
1369                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1370
1371         def report_extraction(self, video_id):
1372                 """Report information extraction."""
1373                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1374
1375         def _real_initialize(self):
1376                 return
1377
1378         def _real_extract(self, url, new_video=True):
1379                 # Extract ID from URL
1380                 mobj = re.match(self._VALID_URL, url)
1381                 if mobj is None:
1382                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1383                         return
1384
1385                 # At this point we have a new video
1386                 self._downloader.increment_downloads()
1387                 video_id = mobj.group(2)
1388                 video_extension = 'flv'
1389
1390                 # Rewrite valid but non-extractable URLs as
1391                 # extractable English language /watch/ URLs
1392                 if re.match(self._VPAGE_URL, url) is None:
1393                         request = urllib2.Request(url)
1394                         try:
1395                                 webpage = urllib2.urlopen(request).read()
1396                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1397                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1398                                 return
1399
1400                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1401                         if mobj is None:
1402                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1403                                 return
1404                         yahoo_id = mobj.group(1)
1405
1406                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1407                         if mobj is None:
1408                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1409                                 return
1410                         yahoo_vid = mobj.group(1)
1411
1412                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1413                         return self._real_extract(url, new_video=False)
1414
1415                 # Retrieve video webpage to extract further information
1416                 request = urllib2.Request(url)
1417                 try:
1418                         self.report_download_webpage(video_id)
1419                         webpage = urllib2.urlopen(request).read()
1420                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1422                         return
1423
1424                 # Extract uploader and title from webpage
1425                 self.report_extraction(video_id)
1426                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1427                 if mobj is None:
1428                         self._downloader.trouble(u'ERROR: unable to extract video title')
1429                         return
1430                 video_title = mobj.group(1).decode('utf-8')
1431                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1432
1433                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1434                 if mobj is None:
1435                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1436                         return
1437                 video_uploader = mobj.group(1).decode('utf-8')
1438
1439                 # Extract video thumbnail
1440                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1441                 if mobj is None:
1442                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1443                         return
1444                 video_thumbnail = mobj.group(1).decode('utf-8')
1445
1446                 # Extract video description
1447                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1448                 if mobj is None:
1449                         self._downloader.trouble(u'ERROR: unable to extract video description')
1450                         return
1451                 video_description = mobj.group(1).decode('utf-8')
1452                 if not video_description: video_description = 'No description available.'
1453
1454                 # Extract video height and width
1455                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract video height')
1458                         return
1459                 yv_video_height = mobj.group(1)
1460
1461                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: unable to extract video width')
1464                         return
1465                 yv_video_width = mobj.group(1)
1466
1467                 # Retrieve video playlist to extract media URL
1468                 # I'm not completely sure what all these options are, but we
1469                 # seem to need most of them, otherwise the server sends a 401.
1470                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1471                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1472                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1473                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1474                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1475                 try:
1476                         self.report_download_webpage(video_id)
1477                         webpage = urllib2.urlopen(request).read()
1478                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1479                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1480                         return
1481
1482                 # Extract media URL from playlist XML
1483                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1484                 if mobj is None:
1485                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1486                         return
1487                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1488                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1489
1490                 try:
1491                         # Process video information
1492                         self._downloader.process_info({
1493                                 'id':           video_id.decode('utf-8'),
1494                                 'url':          video_url,
1495                                 'uploader':     video_uploader,
1496                                 'title':        video_title,
1497                                 'stitle':       simple_title,
1498                                 'ext':          video_extension.decode('utf-8'),
1499                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1500                                 'description':  video_description,
1501                                 'thumbnail':    video_thumbnail,
1502                                 'description':  video_description,
1503                                 'player_url':   None,
1504                         })
1505                 except UnavailableVideoError:
1506                         self._downloader.trouble(u'ERROR: unable to download video')
1507
1508
1509 class GenericIE(InfoExtractor):
1510         """Generic last-resort information extractor."""
1511
1512         def __init__(self, downloader=None):
1513                 InfoExtractor.__init__(self, downloader)
1514
1515         @staticmethod
1516         def suitable(url):
1517                 return True
1518
1519         def report_download_webpage(self, video_id):
1520                 """Report webpage download."""
1521                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1522                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1523
1524         def report_extraction(self, video_id):
1525                 """Report information extraction."""
1526                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1527
1528         def _real_initialize(self):
1529                 return
1530
1531         def _real_extract(self, url):
1532                 # At this point we have a new video
1533                 self._downloader.increment_downloads()
1534
1535                 video_id = url.split('/')[-1]
1536                 request = urllib2.Request(url)
1537                 try:
1538                         self.report_download_webpage(video_id)
1539                         webpage = urllib2.urlopen(request).read()
1540                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1541                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1542                         return
1543                 except ValueError, err:
1544                         # since this is the last-resort InfoExtractor, if
1545                         # this error is thrown, it'll be thrown here
1546                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1547                         return
1548
1549                 # Start with something easy: JW Player in SWFObject
1550                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1551                 if mobj is None:
1552                         # Broaden the search a little bit
1553                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1554                 if mobj is None:
1555                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1556                         return
1557
1558                 # It's possible that one of the regexes
1559                 # matched, but returned an empty group:
1560                 if mobj.group(1) is None:
1561                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1562                         return
1563
1564                 video_url = urllib.unquote(mobj.group(1))
1565                 video_id  = os.path.basename(video_url)
1566
1567                 # here's a fun little line of code for you:
1568                 video_extension = os.path.splitext(video_id)[1][1:]
1569                 video_id        = os.path.splitext(video_id)[0]
1570
1571                 # it's tempting to parse this further, but you would
1572                 # have to take into account all the variations like
1573                 #   Video Title - Site Name
1574                 #   Site Name | Video Title
1575                 #   Video Title - Tagline | Site Name
1576                 # and so on and so forth; it's just not practical
1577                 mobj = re.search(r'<title>(.*)</title>', webpage)
1578                 if mobj is None:
1579                         self._downloader.trouble(u'ERROR: unable to extract title')
1580                         return
1581                 video_title = mobj.group(1).decode('utf-8')
1582                 video_title = sanitize_title(video_title)
1583                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1584
1585                 # video uploader is domain name
1586                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1587                 if mobj is None:
1588                         self._downloader.trouble(u'ERROR: unable to extract title')
1589                         return
1590                 video_uploader = mobj.group(1).decode('utf-8')
1591
1592                 try:
1593                         # Process video information
1594                         self._downloader.process_info({
1595                                 'id':           video_id.decode('utf-8'),
1596                                 'url':          video_url.decode('utf-8'),
1597                                 'uploader':     video_uploader,
1598                                 'title':        video_title,
1599                                 'stitle':       simple_title,
1600                                 'ext':          video_extension.decode('utf-8'),
1601                                 'format':       u'NA',
1602                                 'player_url':   None,
1603                         })
1604                 except UnavailableVideoError, err:
1605                         self._downloader.trouble(u'ERROR: unable to download video')
1606
1607
1608 class YoutubeSearchIE(InfoExtractor):
1609         """Information Extractor for YouTube search queries."""
1610         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1611         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1612         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1613         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1614         _youtube_ie = None
1615         _max_youtube_results = 1000
1616
1617         def __init__(self, youtube_ie, downloader=None):
1618                 InfoExtractor.__init__(self, downloader)
1619                 self._youtube_ie = youtube_ie
1620         
1621         @staticmethod
1622         def suitable(url):
1623                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1624
1625         def report_download_page(self, query, pagenum):
1626                 """Report attempt to download playlist page with given number."""
1627                 query = query.decode(preferredencoding())
1628                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1629
1630         def _real_initialize(self):
1631                 self._youtube_ie.initialize()
1632         
1633         def _real_extract(self, query):
1634                 mobj = re.match(self._VALID_QUERY, query)
1635                 if mobj is None:
1636                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1637                         return
1638
1639                 prefix, query = query.split(':')
1640                 prefix = prefix[8:]
1641                 query  = query.encode('utf-8')
1642                 if prefix == '':
1643                         self._download_n_results(query, 1)
1644                         return
1645                 elif prefix == 'all':
1646                         self._download_n_results(query, self._max_youtube_results)
1647                         return
1648                 else:
1649                         try:
1650                                 n = long(prefix)
1651                                 if n <= 0:
1652                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1653                                         return
1654                                 elif n > self._max_youtube_results:
1655                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1656                                         n = self._max_youtube_results
1657                                 self._download_n_results(query, n)
1658                                 return
1659                         except ValueError: # parsing prefix as integer fails
1660                                 self._download_n_results(query, 1)
1661                                 return
1662
1663         def _download_n_results(self, query, n):
1664                 """Downloads a specified number of results for a query"""
1665
1666                 video_ids = []
1667                 already_seen = set()
1668                 pagenum = 1
1669
1670                 while True:
1671                         self.report_download_page(query, pagenum)
1672                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1673                         request = urllib2.Request(result_url, None, std_headers)
1674                         try:
1675                                 page = urllib2.urlopen(request).read()
1676                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1677                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1678                                 return
1679
1680                         # Extract video identifiers
1681                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1682                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1683                                 if video_id not in already_seen:
1684                                         video_ids.append(video_id)
1685                                         already_seen.add(video_id)
1686                                         if len(video_ids) == n:
1687                                                 # Specified n videos reached
1688                                                 for id in video_ids:
1689                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1690                                                 return
1691
1692                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1693                                 for id in video_ids:
1694                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1695                                 return
1696
1697                         pagenum = pagenum + 1
1698
1699 class GoogleSearchIE(InfoExtractor):
1700         """Information Extractor for Google Video search queries."""
1701         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1702         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1703         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1704         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1705         _google_ie = None
1706         _max_google_results = 1000
1707
1708         def __init__(self, google_ie, downloader=None):
1709                 InfoExtractor.__init__(self, downloader)
1710                 self._google_ie = google_ie
1711         
1712         @staticmethod
1713         def suitable(url):
1714                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1715
1716         def report_download_page(self, query, pagenum):
1717                 """Report attempt to download playlist page with given number."""
1718                 query = query.decode(preferredencoding())
1719                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1720
1721         def _real_initialize(self):
1722                 self._google_ie.initialize()
1723         
1724         def _real_extract(self, query):
1725                 mobj = re.match(self._VALID_QUERY, query)
1726                 if mobj is None:
1727                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1728                         return
1729
1730                 prefix, query = query.split(':')
1731                 prefix = prefix[8:]
1732                 query  = query.encode('utf-8')
1733                 if prefix == '':
1734                         self._download_n_results(query, 1)
1735                         return
1736                 elif prefix == 'all':
1737                         self._download_n_results(query, self._max_google_results)
1738                         return
1739                 else:
1740                         try:
1741                                 n = long(prefix)
1742                                 if n <= 0:
1743                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1744                                         return
1745                                 elif n > self._max_google_results:
1746                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1747                                         n = self._max_google_results
1748                                 self._download_n_results(query, n)
1749                                 return
1750                         except ValueError: # parsing prefix as integer fails
1751                                 self._download_n_results(query, 1)
1752                                 return
1753
1754         def _download_n_results(self, query, n):
1755                 """Downloads a specified number of results for a query"""
1756
1757                 video_ids = []
1758                 already_seen = set()
1759                 pagenum = 1
1760
1761                 while True:
1762                         self.report_download_page(query, pagenum)
1763                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1764                         request = urllib2.Request(result_url, None, std_headers)
1765                         try:
1766                                 page = urllib2.urlopen(request).read()
1767                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1768                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1769                                 return
1770
1771                         # Extract video identifiers
1772                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1773                                 video_id = mobj.group(1)
1774                                 if video_id not in already_seen:
1775                                         video_ids.append(video_id)
1776                                         already_seen.add(video_id)
1777                                         if len(video_ids) == n:
1778                                                 # Specified n videos reached
1779                                                 for id in video_ids:
1780                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1781                                                 return
1782
1783                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1784                                 for id in video_ids:
1785                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1786                                 return
1787
1788                         pagenum = pagenum + 1
1789
class YahooSearchIE(InfoExtractor):
	"""Information Extractor for Yahoo! Video search queries."""
	# Queries look like "yvsearch:terms", "yvsearchN:terms" or "yvsearchall:terms".
	_VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
	# Matches result links; group(1) is the "id/id" pair used in watch URLs.
	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
	_MORE_PAGES_INDICATOR = r'\s*Next'
	# The Yahoo! IE that actually downloads each found video.
	_yahoo_ie = None
	# Hard cap on results, mirrored in the "all" prefix handling below.
	_max_yahoo_results = 1000

	def __init__(self, yahoo_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie

	@staticmethod
	def suitable(url):
		return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		# Delegate initialization (e.g. login) to the wrapped Yahoo! IE.
		self._yahoo_ie.initialize()

	def _real_extract(self, query):
		"""Parse the yvsearch prefix and download the requested results."""
		mobj = re.match(self._VALID_QUERY, query)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
			return

		# Split "yvsearchN:terms" into the count prefix and the terms;
		# "yvsearch" is 8 characters long.
		prefix, query = query.split(':')
		prefix = prefix[8:]
		query  = query.encode('utf-8')
		if prefix == '':
			# Plain "yvsearch:" means a single result.
			self._download_n_results(query, 1)
			return
		elif prefix == 'all':
			self._download_n_results(query, self._max_yahoo_results)
			return
		else:
			try:
				n = long(prefix)
				if n <= 0:
					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
					return
				elif n > self._max_yahoo_results:
					# Clamp oversized requests to the supported maximum.
					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
					n = self._max_yahoo_results
				self._download_n_results(query, n)
				return
			except ValueError: # parsing prefix as integer fails
				self._download_n_results(query, 1)
				return
1844
1845         def _download_n_results(self, query, n):
1846                 """Downloads a specified number of results for a query"""
1847
1848                 video_ids = []
1849                 already_seen = set()
1850                 pagenum = 1
1851
1852                 while True:
1853                         self.report_download_page(query, pagenum)
1854                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1855                         request = urllib2.Request(result_url, None, std_headers)
1856                         try:
1857                                 page = urllib2.urlopen(request).read()
1858                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1859                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1860                                 return
1861
1862                         # Extract video identifiers
1863                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1864                                 video_id = mobj.group(1)
1865                                 if video_id not in already_seen:
1866                                         video_ids.append(video_id)
1867                                         already_seen.add(video_id)
1868                                         if len(video_ids) == n:
1869                                                 # Specified n videos reached
1870                                                 for id in video_ids:
1871                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1872                                                 return
1873
1874                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1875                                 for id in video_ids:
1876                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1877                                 return
1878
1879                         pagenum = pagenum + 1
1880
1881 class YoutubePlaylistIE(InfoExtractor):
1882         """Information Extractor for YouTube playlists."""
1883
1884         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1885         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1886         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1887         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1888         _youtube_ie = None
1889
1890         def __init__(self, youtube_ie, downloader=None):
1891                 InfoExtractor.__init__(self, downloader)
1892                 self._youtube_ie = youtube_ie
1893         
1894         @staticmethod
1895         def suitable(url):
1896                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1897
1898         def report_download_page(self, playlist_id, pagenum):
1899                 """Report attempt to download playlist page with given number."""
1900                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1901
1902         def _real_initialize(self):
1903                 self._youtube_ie.initialize()
1904         
1905         def _real_extract(self, url):
1906                 # Extract playlist id
1907                 mobj = re.match(self._VALID_URL, url)
1908                 if mobj is None:
1909                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1910                         return
1911
1912                 # Download playlist pages
1913                 playlist_id = mobj.group(1)
1914                 video_ids = []
1915                 pagenum = 1
1916
1917                 while True:
1918                         self.report_download_page(playlist_id, pagenum)
1919                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1920                         try:
1921                                 page = urllib2.urlopen(request).read()
1922                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1923                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1924                                 return
1925
1926                         # Extract video identifiers
1927                         ids_in_page = []
1928                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1929                                 if mobj.group(1) not in ids_in_page:
1930                                         ids_in_page.append(mobj.group(1))
1931                         video_ids.extend(ids_in_page)
1932
1933                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1934                                 break
1935                         pagenum = pagenum + 1
1936
1937                 for id in video_ids:
1938                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1939                 return
1940
1941 class YoutubeUserIE(InfoExtractor):
1942         """Information Extractor for YouTube users."""
1943
1944         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1945         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1946         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1947         _youtube_ie = None
1948
1949         def __init__(self, youtube_ie, downloader=None):
1950                 InfoExtractor.__init__(self, downloader)
1951                 self._youtube_ie = youtube_ie
1952         
1953         @staticmethod
1954         def suitable(url):
1955                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1956
1957         def report_download_page(self, username):
1958                 """Report attempt to download user page."""
1959                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1960
1961         def _real_initialize(self):
1962                 self._youtube_ie.initialize()
1963         
1964         def _real_extract(self, url):
1965                 # Extract username
1966                 mobj = re.match(self._VALID_URL, url)
1967                 if mobj is None:
1968                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1969                         return
1970
1971                 # Download user page
1972                 username = mobj.group(1)
1973                 video_ids = []
1974                 pagenum = 1
1975
1976                 self.report_download_page(username)
1977                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1978                 try:
1979                         page = urllib2.urlopen(request).read()
1980                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1981                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1982                         return
1983
1984                 # Extract video identifiers
1985                 ids_in_page = []
1986
1987                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1988                         if mobj.group(1) not in ids_in_page:
1989                                 ids_in_page.append(mobj.group(1))
1990                 video_ids.extend(ids_in_page)
1991
1992                 for id in video_ids:
1993                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1994                 return
1995
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. Once a download finishes successfully,
	the downloader walks its chain of PostProcessors, feeding each run()
	call the value returned by the previous one (starting from an
	initial argument). A None return value — or reaching the end of the
	chain — stops the processing.

	Like InfoExtractor objects, post processors keep a back-reference to
	the downloader they belong to ("mutual registration").
	"""

	# Downloader this processor is attached to (None until registered).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is a dictionary in the format produced by the
		InfoExtractors, extended with a "filepath" entry pointing at the
		downloaded file on disk.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it on to the next processor. A
		PostProcessingError may also be raised and is handled by the
		calling downloader.

		The base implementation forwards the dictionary unchanged.
		"""
		return information
2041         
### MAIN PROGRAM ###
# Entry point: parse options, wire up the info extractors and the file
# downloader, then download every URL given on the command line / batch file.
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# NOTE(review): the new version is fetched over plain HTTP and
			# written to disk with no integrity check — the update URL must
			# be trusted.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the first;
		# this presumably still works for proxies because build_opener() adds
		# a default ProxyHandler — verify against the urllib2 docs.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		# conflict_handler='resolve' lets -h/-v be redefined below.
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.07.24',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		# '-' reads URLs from stdin; blank lines are discarded.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		# Search/playlist/user extractors wrap a concrete extractor and
		# delegate the actual per-video extraction to it.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		# The 'outtmpl' expression picks the first template whose guard
		# options are truthy, falling back to u'%(id)s.%(ext)s'.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		# NOTE(review): registration order appears significant — the more
		# specific extractors (search/playlist/user) are added before the
		# extractors they wrap; presumably the first suitable() match wins.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')