00d9696d6299ad4b0743798684aa34df6af57682
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# Headers sent with every HTTP request. The User-Agent mimics a real
# Firefox browser because some sites serve different (or no) content to
# clients they do not recognize.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Firefox/3.6.7',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters allowed verbatim in "simplified" titles: ASCII letters and
# digits, decoded so the result is a unicode string (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	try:
		pref = locale.getpreferredencoding()
		# Probe the encoding name: an unknown or broken locale would
		# otherwise make every later .encode(pref) call blow up.
		u'TEST'.encode(pref)
	except Exception:
		# Fall back to a safe default instead of propagating locale
		# errors. (The previous bare "except:" also swallowed
		# KeyboardInterrupt/SystemExit, and the result was wrapped in
		# a needless single-use generator.)
		pref = 'UTF-8'
	return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Download Error exception.

	Thrown by FileDownloader objects that are not configured to continue
	on errors; carries the appropriate error message.
	"""
117
class SameFileError(Exception):
	"""Same File exception.

	Thrown by FileDownloader objects when they detect that multiple files
	would have to be downloaded to the same file on disk.
	"""
125
class PostProcessingError(Exception):
	"""Post Processing exception.

	Raised by a PostProcessor's .run() method to indicate an error during
	the postprocessing task.
	"""
133
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	Thrown when a video is requested in a format that is not available
	for that video.
	"""
141
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	Raised by FileDownloader objects when the data they downloaded is
	smaller than what the server announced first, which usually means
	the connection was interrupted.
	"""
	# Byte counts describing the mismatch.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Record how much arrived versus how much was promised.
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	format_limit:   Highest quality format to try.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	retries:        Number of times to retry for HTTP error 503
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	params = None			# Option dictionary given to the constructor
	_ies = []			# Registered InfoExtractors, in order
	_pps = []			# Registered PostProcessors, in order
	_download_retcode = None	# Process return code reported by download()
	_num_downloads = None		# Ordinal of the current download ('ord' template key)

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		# Rebind the list attributes so instances do not share the
		# mutable class-level defaults.
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build every ancestor path of the file, shortest first, then
		# create each one that does not exist yet.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string (e.g. '1.21M')."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Integer part of log base 1024 selects the unit suffix.
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return download progress as a fixed-width percentage string."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining download time as 'MM:SS' (or '--:--' if unknown)."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a right-aligned string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Pick the next read size, adapting to the measured throughput."""
		# Allow the block size to at most halve or double per step.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix maps to index 0 ('b'), i.e. multiplier 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading one byte forces the server to actually start serving.
		data.read(1)
		# geturl() reflects any redirects that were followed.
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed (contains no %(...)s fields)."""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough for the average speed to drop
			# back to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# \r rewrites the progress line in place instead of scrolling.
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 503"""
		self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) name.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# Just terminate the in-place progress line.
			self.to_stdout(u'')

	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		self._num_downloads += 1

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableVideoError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# 'epoch' and 'ord' are extra template keys on top of the
			# fields supplied by the InfoExtractor.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			# NOTE(review): if this branch runs with 'ignoreerrors' set,
			# 'filename' is left unbound and the lines below raise
			# NameError; this relies on trouble() raising otherwise.
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed template can only name one file, so multiple URLs
		# would clobber each other.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		# Each processor receives the previous one's output; returning
		# None stops the chain.
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break

	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by driving the external rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			# Keep resuming (-e) while rtmpdump reports an
			# interrupted-but-resumable download.
			prevsize = os.path.getsize(filename)
			self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(filename)
			if prevsize == cursize and retval == 1:
				# No progress was made; give up to avoid looping forever.
				break
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (or hand off rtmp:// URLs)."""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		stream = None
		open_mode = 'wb'
		# basic_request has no Range header; it is the fallback when
		# the server rejects a resume attempt.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while True:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if err.code == 503:
					# Retry in case of HTTP error 503
					count += 1
					if count <= retries:
						self.report_retry(count, retries)
						continue
				if err.code != 416: #  416 is 'Requested range not satisfiable'
					raise
				# Unable to resume
				data = urllib2.urlopen(basic_request)
				content_length = data.info()['Content-Length']

				if content_length is not None and long(content_length) == resume_len:
					# Because the file had already been fully downloaded
					self.report_file_already_downloaded(filename)
					return True
				else:
					# Because the server didn't let us
					self.report_unable_to_resume()
					open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time, so a failed connection leaves
			# no empty file behind.
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble('\nERROR: unable to write data: %s' % str(err))
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# NOTE: data_len is the raw Content-Length header string, so the
		# counter is compared as a string on purpose.
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
611
612 class InfoExtractor(object):
613         """Information Extractor class.
614
615         Information extractors are the classes that, given a URL, extract
616         information from the video (or videos) the URL refers to. This
617         information includes the real video URL, the video title and simplified
618         title, author and others. The information is stored in a dictionary
619         which is then passed to the FileDownloader. The FileDownloader
620         processes this information possibly downloading the video to the file
621         system, among other possible outcomes. The dictionaries must include
622         the following fields:
623
624         id:             Video identifier.
625         url:            Final video URL.
626         uploader:       Nickname of the video uploader.
627         title:          Literal title.
628         stitle:         Simplified title.
629         ext:            Video filename extension.
630         format:         Video format.
631         player_url:     SWF Player URL (may be None).
632
633         The following fields are optional. Their primary purpose is to allow
634         youtube-dl to serve as the backend for a video search function, such
635         as the one in youtube2mp3.  They are only used when their respective
636         forced printing functions are called:
637
638         thumbnail:      Full URL to a video thumbnail image.
639         description:    One-line video description.
640
641         Subclasses of this one should re-define the _real_initialize() and
642         _real_extract() methods, as well as the suitable() static method.
643         Probably, they should also be instantiated and added to the main
644         downloader.
645         """
646
647         _ready = False
648         _downloader = None
649
650         def __init__(self, downloader=None):
651                 """Constructor. Receives an optional downloader."""
652                 self._ready = False
653                 self.set_downloader(downloader)
654
	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		# Base implementation matches nothing; each subclass overrides this
		# with a check against its own URL pattern.
		return False
659
660         def initialize(self):
661                 """Initializes an instance (authentication, etc)."""
662                 if not self._ready:
663                         self._real_initialize()
664                         self._ready = True
665
666         def extract(self, url):
667                 """Extracts URL information and returns it in list of dicts."""
668                 self.initialize()
669                 return self._real_extract(url)
670
	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		# May be None; methods that report progress assume one has been set.
		self._downloader = downloader
674         
	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		# Invoked at most once per instance, via initialize().
		pass
678
	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		# Invoked on every extract() call, after initialize() has run.
		pass
682
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches the (optional) URL prefix; group 2 captures the video
	# ID.  The conditional pattern (?(1).+)? only permits trailing text when
	# a prefix was present, so a bare video ID still matches.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL forces the site into English so the regexes below
	# match reliably.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when the 'usenetrc' option is set.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Format code -> file extension; codes not listed here download as 'flv'
	# (see the .get(..., 'flv') fallback in _real_extract).
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language and, when credentials are available, log in
		and confirm age.  Failures are reported as warnings (or an error for
		the age step) and abort the remaining initialization steps."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the real video URL(s) and metadata for the given watch URL
		and hand each selected format to the FileDownloader."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		# (passed through to the downloader as 'player_url'; may be None)
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info
		# Several 'el' parameter values are tried in turn; the loop stops at
		# the first response that contains a 'token' field.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		# Convert the byte string to unicode before sanitizing.
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		# Collapse every run of characters outside simple_title_chars into a
		# single underscore, then trim leading/trailing underscores.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description
		# Only scraped when the caller asked for it explicitly.
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)

		if requested_format in ["43", "45"]: # webm formats
			# Join the HTML5 beta
			# (presumably required for the watch page to expose the webm
			# setAvailableFormat entries used below -- TODO confirm)
			html5form = { "enable_html5": "true" }
			request = urllib2.Request('http://www.youtube.com/html5', urllib.urlencode(html5form), std_headers)
			try:
				self._downloader.to_stdout(u'[youtube] Joining the HTML5 Beta')
				urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to join the HTML5 Beta: %s' % str(err))
				return

			# Request the video webpage with webm enabled
			request = urllib2.Request('http://www.youtube.com/watch?v=%s&webm=1' % video_id, None, std_headers)
			try:
				self._downloader.to_stdout(u'[youtube] Requesting HTML5 video webpage')
				video_webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to get the HTML5 video webpage: %s' % str(err))
				return

			# Find the URL for the requested format
			mobj = re.search(ur'setAvailableFormat\("(.*?)".*?"%s"\);' % requested_format, video_webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: format not available for video')
				return
			video_url_list = [(requested_format, mobj.group(1))]

		elif 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# Only consider formats at or below the quality cap.
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif requested_format == '-1':
				video_url_list = url_map.items() # All formats
			else:
				if requested_format not in existing_formats:
					self._downloader.trouble(u'ERROR: format not available for video')
					return
				video_url_list = [(requested_format, url_map[requested_format])] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# RTMP streams carry their URL in 'conn'; no format code applies.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video')
964
965
966 class MetacafeIE(InfoExtractor):
967         """Information Extractor for metacafe.com."""
968
969         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
970         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
971         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
972         _youtube_ie = None
973
974         def __init__(self, youtube_ie, downloader=None):
975                 InfoExtractor.__init__(self, downloader)
976                 self._youtube_ie = youtube_ie
977
978         @staticmethod
979         def suitable(url):
980                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
981
982         def report_disclaimer(self):
983                 """Report disclaimer retrieval."""
984                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
985
986         def report_age_confirmation(self):
987                 """Report attempt to confirm age."""
988                 self._downloader.to_stdout(u'[metacafe] Confirming age')
989         
990         def report_download_webpage(self, video_id):
991                 """Report webpage download."""
992                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
993         
994         def report_extraction(self, video_id):
995                 """Report information extraction."""
996                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
997
998         def _real_initialize(self):
999                 # Retrieve disclaimer
1000                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1001                 try:
1002                         self.report_disclaimer()
1003                         disclaimer = urllib2.urlopen(request).read()
1004                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1005                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1006                         return
1007
1008                 # Confirm age
1009                 disclaimer_form = {
1010                         'filters': '0',
1011                         'submit': "Continue - I'm over 18",
1012                         }
1013                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1014                 try:
1015                         self.report_age_confirmation()
1016                         disclaimer = urllib2.urlopen(request).read()
1017                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1018                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1019                         return
1020         
1021         def _real_extract(self, url):
1022                 # Extract id and simplified title from URL
1023                 mobj = re.match(self._VALID_URL, url)
1024                 if mobj is None:
1025                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1026                         return
1027
1028                 video_id = mobj.group(1)
1029
1030                 # Check if video comes from YouTube
1031                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1032                 if mobj2 is not None:
1033                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1034                         return
1035
1036                 # At this point we have a new video
1037                 self._downloader.increment_downloads()
1038
1039                 simple_title = mobj.group(2).decode('utf-8')
1040                 video_extension = 'flv'
1041
1042                 # Retrieve video webpage to extract further information
1043                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1044                 try:
1045                         self.report_download_webpage(video_id)
1046                         webpage = urllib2.urlopen(request).read()
1047                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1048                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1049                         return
1050
1051                 # Extract URL, uploader and title from webpage
1052                 self.report_extraction(video_id)
1053                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1054                 if mobj is None:
1055                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1056                         return
1057                 mediaURL = urllib.unquote(mobj.group(1))
1058
1059                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1060                 #if mobj is None:
1061                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1062                 #       return
1063                 #gdaKey = mobj.group(1)
1064                 #
1065                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1066
1067                 video_url = mediaURL
1068
1069                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1070                 if mobj is None:
1071                         self._downloader.trouble(u'ERROR: unable to extract title')
1072                         return
1073                 video_title = mobj.group(1).decode('utf-8')
1074                 video_title = sanitize_title(video_title)
1075
1076                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1077                 if mobj is None:
1078                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1079                         return
1080                 video_uploader = mobj.group(1)
1081
1082                 try:
1083                         # Process video information
1084                         self._downloader.process_info({
1085                                 'id':           video_id.decode('utf-8'),
1086                                 'url':          video_url.decode('utf-8'),
1087                                 'uploader':     video_uploader.decode('utf-8'),
1088                                 'title':        video_title,
1089                                 'stitle':       simple_title,
1090                                 'ext':          video_extension.decode('utf-8'),
1091                                 'format':       u'NA',
1092                                 'player_url':   None,
1093                         })
1094                 except UnavailableVideoError:
1095                         self._downloader.trouble(u'ERROR: unable to download video')
1096
1097
1098 class DailymotionIE(InfoExtractor):
1099         """Information Extractor for Dailymotion"""
1100
1101         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1102
1103         def __init__(self, downloader=None):
1104                 InfoExtractor.__init__(self, downloader)
1105
1106         @staticmethod
1107         def suitable(url):
1108                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1109
1110         def report_download_webpage(self, video_id):
1111                 """Report webpage download."""
1112                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1113         
1114         def report_extraction(self, video_id):
1115                 """Report information extraction."""
1116                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1117
1118         def _real_initialize(self):
1119                 return
1120
1121         def _real_extract(self, url):
1122                 # Extract id and simplified title from URL
1123                 mobj = re.match(self._VALID_URL, url)
1124                 if mobj is None:
1125                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1126                         return
1127
1128                 # At this point we have a new video
1129                 self._downloader.increment_downloads()
1130                 video_id = mobj.group(1)
1131
1132                 simple_title = mobj.group(2).decode('utf-8')
1133                 video_extension = 'flv'
1134
1135                 # Retrieve video webpage to extract further information
1136                 request = urllib2.Request(url)
1137                 try:
1138                         self.report_download_webpage(video_id)
1139                         webpage = urllib2.urlopen(request).read()
1140                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1141                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1142                         return
1143
1144                 # Extract URL, uploader and title from webpage
1145                 self.report_extraction(video_id)
1146                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1147                 if mobj is None:
1148                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1149                         return
1150                 mediaURL = urllib.unquote(mobj.group(1))
1151
1152                 # if needed add http://www.dailymotion.com/ if relative URL
1153
1154                 video_url = mediaURL
1155
1156                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1157                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1158                 if mobj is None:
1159                         self._downloader.trouble(u'ERROR: unable to extract title')
1160                         return
1161                 video_title = mobj.group(1).decode('utf-8')
1162                 video_title = sanitize_title(video_title)
1163
1164                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1165                 if mobj is None:
1166                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1167                         return
1168                 video_uploader = mobj.group(1)
1169
1170                 try:
1171                         # Process video information
1172                         self._downloader.process_info({
1173                                 'id':           video_id.decode('utf-8'),
1174                                 'url':          video_url.decode('utf-8'),
1175                                 'uploader':     video_uploader.decode('utf-8'),
1176                                 'title':        video_title,
1177                                 'stitle':       simple_title,
1178                                 'ext':          video_extension.decode('utf-8'),
1179                                 'format':       u'NA',
1180                                 'player_url':   None,
1181                         })
1182                 except UnavailableVideoError:
1183                         self._downloader.trouble(u'ERROR: unable to download video')
1184
1185 class GoogleIE(InfoExtractor):
1186         """Information extractor for video.google.com."""
1187
1188         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1189
1190         def __init__(self, downloader=None):
1191                 InfoExtractor.__init__(self, downloader)
1192
1193         @staticmethod
1194         def suitable(url):
1195                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1196
1197         def report_download_webpage(self, video_id):
1198                 """Report webpage download."""
1199                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1200
1201         def report_extraction(self, video_id):
1202                 """Report information extraction."""
1203                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1204
1205         def _real_initialize(self):
1206                 return
1207
	def _real_extract(self, url):
		"""Extract and download a single Google Video.

		Fetches the video page, locates the media URL (direct mp4
		download link, or the flv stream URL as a fallback), the title
		and description, optionally a thumbnail, and hands everything
		to the FileDownloader via process_info().
		"""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link; fall back to the JS-escaped
			# flv stream URL embedded in the page
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Undo the JavaScript hex escaping of '=' and '&' in the URL
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		# Filesystem-safe title: collapse anything outside the simple
		# character set into underscores
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		# Only fetch it when the user asked for it, since it needs an
		# extra search-page request
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''


		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1293
1294
1295 class PhotobucketIE(InfoExtractor):
1296         """Information extractor for photobucket.com."""
1297
1298         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1299
1300         def __init__(self, downloader=None):
1301                 InfoExtractor.__init__(self, downloader)
1302
1303         @staticmethod
1304         def suitable(url):
1305                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1306
1307         def report_download_webpage(self, video_id):
1308                 """Report webpage download."""
1309                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1310
1311         def report_extraction(self, video_id):
1312                 """Report information extraction."""
1313                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1314
1315         def _real_initialize(self):
1316                 return
1317
1318         def _real_extract(self, url):
1319                 # Extract id from URL
1320                 mobj = re.match(self._VALID_URL, url)
1321                 if mobj is None:
1322                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1323                         return
1324
1325                 # At this point we have a new video
1326                 self._downloader.increment_downloads()
1327                 video_id = mobj.group(1)
1328
1329                 video_extension = 'flv'
1330
1331                 # Retrieve video webpage to extract further information
1332                 request = urllib2.Request(url)
1333                 try:
1334                         self.report_download_webpage(video_id)
1335                         webpage = urllib2.urlopen(request).read()
1336                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1337                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1338                         return
1339
1340                 # Extract URL, uploader, and title from webpage
1341                 self.report_extraction(video_id)
1342                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1343                 if mobj is None:
1344                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1345                         return
1346                 mediaURL = urllib.unquote(mobj.group(1))
1347
1348                 video_url = mediaURL
1349
1350                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1351                 if mobj is None:
1352                         self._downloader.trouble(u'ERROR: unable to extract title')
1353                         return
1354                 video_title = mobj.group(1).decode('utf-8')
1355                 video_title = sanitize_title(video_title)
1356                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1357
1358                 video_uploader = mobj.group(2).decode('utf-8')
1359
1360                 try:
1361                         # Process video information
1362                         self._downloader.process_info({
1363                                 'id':           video_id.decode('utf-8'),
1364                                 'url':          video_url.decode('utf-8'),
1365                                 'uploader':     video_uploader,
1366                                 'title':        video_title,
1367                                 'stitle':       simple_title,
1368                                 'ext':          video_extension.decode('utf-8'),
1369                                 'format':       u'NA',
1370                                 'player_url':   None,
1371                         })
1372                 except UnavailableVideoError:
1373                         self._downloader.trouble(u'ERROR: unable to download video')
1374
1375
1376 class YahooIE(InfoExtractor):
1377         """Information extractor for video.yahoo.com."""
1378
1379         # _VALID_URL matches all Yahoo! Video URLs
1380         # _VPAGE_URL matches only the extractable '/watch/' URLs
1381         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1382         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1383
1384         def __init__(self, downloader=None):
1385                 InfoExtractor.__init__(self, downloader)
1386
1387         @staticmethod
1388         def suitable(url):
1389                 return (re.match(YahooIE._VALID_URL, url) is not None)
1390
1391         def report_download_webpage(self, video_id):
1392                 """Report webpage download."""
1393                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1394
1395         def report_extraction(self, video_id):
1396                 """Report information extraction."""
1397                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1398
1399         def _real_initialize(self):
1400                 return
1401
1402         def _real_extract(self, url, new_video=True):
1403                 # Extract ID from URL
1404                 mobj = re.match(self._VALID_URL, url)
1405                 if mobj is None:
1406                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1407                         return
1408
1409                 # At this point we have a new video
1410                 self._downloader.increment_downloads()
1411                 video_id = mobj.group(2)
1412                 video_extension = 'flv'
1413
1414                 # Rewrite valid but non-extractable URLs as
1415                 # extractable English language /watch/ URLs
1416                 if re.match(self._VPAGE_URL, url) is None:
1417                         request = urllib2.Request(url)
1418                         try:
1419                                 webpage = urllib2.urlopen(request).read()
1420                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1422                                 return
1423
1424                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1425                         if mobj is None:
1426                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1427                                 return
1428                         yahoo_id = mobj.group(1)
1429
1430                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1431                         if mobj is None:
1432                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1433                                 return
1434                         yahoo_vid = mobj.group(1)
1435
1436                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1437                         return self._real_extract(url, new_video=False)
1438
1439                 # Retrieve video webpage to extract further information
1440                 request = urllib2.Request(url)
1441                 try:
1442                         self.report_download_webpage(video_id)
1443                         webpage = urllib2.urlopen(request).read()
1444                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1445                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1446                         return
1447
1448                 # Extract uploader and title from webpage
1449                 self.report_extraction(video_id)
1450                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1451                 if mobj is None:
1452                         self._downloader.trouble(u'ERROR: unable to extract video title')
1453                         return
1454                 video_title = mobj.group(1).decode('utf-8')
1455                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1456
1457                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1458                 if mobj is None:
1459                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1460                         return
1461                 video_uploader = mobj.group(1).decode('utf-8')
1462
1463                 # Extract video thumbnail
1464                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1465                 if mobj is None:
1466                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1467                         return
1468                 video_thumbnail = mobj.group(1).decode('utf-8')
1469
1470                 # Extract video description
1471                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1472                 if mobj is None:
1473                         self._downloader.trouble(u'ERROR: unable to extract video description')
1474                         return
1475                 video_description = mobj.group(1).decode('utf-8')
1476                 if not video_description: video_description = 'No description available.'
1477
1478                 # Extract video height and width
1479                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1480                 if mobj is None:
1481                         self._downloader.trouble(u'ERROR: unable to extract video height')
1482                         return
1483                 yv_video_height = mobj.group(1)
1484
1485                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1486                 if mobj is None:
1487                         self._downloader.trouble(u'ERROR: unable to extract video width')
1488                         return
1489                 yv_video_width = mobj.group(1)
1490
1491                 # Retrieve video playlist to extract media URL
1492                 # I'm not completely sure what all these options are, but we
1493                 # seem to need most of them, otherwise the server sends a 401.
1494                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1495                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1496                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1497                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1498                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1499                 try:
1500                         self.report_download_webpage(video_id)
1501                         webpage = urllib2.urlopen(request).read()
1502                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1503                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1504                         return
1505
1506                 # Extract media URL from playlist XML
1507                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1508                 if mobj is None:
1509                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1510                         return
1511                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1512                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1513
1514                 try:
1515                         # Process video information
1516                         self._downloader.process_info({
1517                                 'id':           video_id.decode('utf-8'),
1518                                 'url':          video_url,
1519                                 'uploader':     video_uploader,
1520                                 'title':        video_title,
1521                                 'stitle':       simple_title,
1522                                 'ext':          video_extension.decode('utf-8'),
1523                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1524                                 'description':  video_description,
1525                                 'thumbnail':    video_thumbnail,
1526                                 'description':  video_description,
1527                                 'player_url':   None,
1528                         })
1529                 except UnavailableVideoError:
1530                         self._downloader.trouble(u'ERROR: unable to download video')
1531
1532
1533 class GenericIE(InfoExtractor):
1534         """Generic last-resort information extractor."""
1535
1536         def __init__(self, downloader=None):
1537                 InfoExtractor.__init__(self, downloader)
1538
1539         @staticmethod
1540         def suitable(url):
1541                 return True
1542
1543         def report_download_webpage(self, video_id):
1544                 """Report webpage download."""
1545                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1546                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1547
1548         def report_extraction(self, video_id):
1549                 """Report information extraction."""
1550                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1551
1552         def _real_initialize(self):
1553                 return
1554
1555         def _real_extract(self, url):
1556                 # At this point we have a new video
1557                 self._downloader.increment_downloads()
1558
1559                 video_id = url.split('/')[-1]
1560                 request = urllib2.Request(url)
1561                 try:
1562                         self.report_download_webpage(video_id)
1563                         webpage = urllib2.urlopen(request).read()
1564                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1565                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1566                         return
1567                 except ValueError, err:
1568                         # since this is the last-resort InfoExtractor, if
1569                         # this error is thrown, it'll be thrown here
1570                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1571                         return
1572
1573                 # Start with something easy: JW Player in SWFObject
1574                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1575                 if mobj is None:
1576                         # Broaden the search a little bit
1577                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1578                 if mobj is None:
1579                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1580                         return
1581
1582                 # It's possible that one of the regexes
1583                 # matched, but returned an empty group:
1584                 if mobj.group(1) is None:
1585                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1586                         return
1587
1588                 video_url = urllib.unquote(mobj.group(1))
1589                 video_id  = os.path.basename(video_url)
1590
1591                 # here's a fun little line of code for you:
1592                 video_extension = os.path.splitext(video_id)[1][1:]
1593                 video_id        = os.path.splitext(video_id)[0]
1594
1595                 # it's tempting to parse this further, but you would
1596                 # have to take into account all the variations like
1597                 #   Video Title - Site Name
1598                 #   Site Name | Video Title
1599                 #   Video Title - Tagline | Site Name
1600                 # and so on and so forth; it's just not practical
1601                 mobj = re.search(r'<title>(.*)</title>', webpage)
1602                 if mobj is None:
1603                         self._downloader.trouble(u'ERROR: unable to extract title')
1604                         return
1605                 video_title = mobj.group(1).decode('utf-8')
1606                 video_title = sanitize_title(video_title)
1607                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1608
1609                 # video uploader is domain name
1610                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1611                 if mobj is None:
1612                         self._downloader.trouble(u'ERROR: unable to extract title')
1613                         return
1614                 video_uploader = mobj.group(1).decode('utf-8')
1615
1616                 try:
1617                         # Process video information
1618                         self._downloader.process_info({
1619                                 'id':           video_id.decode('utf-8'),
1620                                 'url':          video_url.decode('utf-8'),
1621                                 'uploader':     video_uploader,
1622                                 'title':        video_title,
1623                                 'stitle':       simple_title,
1624                                 'ext':          video_extension.decode('utf-8'),
1625                                 'format':       u'NA',
1626                                 'player_url':   None,
1627                         })
1628                 except UnavailableVideoError, err:
1629                         self._downloader.trouble(u'ERROR: unable to download video')
1630
1631
1632 class YoutubeSearchIE(InfoExtractor):
1633         """Information Extractor for YouTube search queries."""
1634         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1635         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1636         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1637         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1638         _youtube_ie = None
1639         _max_youtube_results = 1000
1640
1641         def __init__(self, youtube_ie, downloader=None):
1642                 InfoExtractor.__init__(self, downloader)
1643                 self._youtube_ie = youtube_ie
1644         
1645         @staticmethod
1646         def suitable(url):
1647                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1648
1649         def report_download_page(self, query, pagenum):
1650                 """Report attempt to download playlist page with given number."""
1651                 query = query.decode(preferredencoding())
1652                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1653
1654         def _real_initialize(self):
1655                 self._youtube_ie.initialize()
1656         
1657         def _real_extract(self, query):
1658                 mobj = re.match(self._VALID_QUERY, query)
1659                 if mobj is None:
1660                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1661                         return
1662
1663                 prefix, query = query.split(':')
1664                 prefix = prefix[8:]
1665                 query  = query.encode('utf-8')
1666                 if prefix == '':
1667                         self._download_n_results(query, 1)
1668                         return
1669                 elif prefix == 'all':
1670                         self._download_n_results(query, self._max_youtube_results)
1671                         return
1672                 else:
1673                         try:
1674                                 n = long(prefix)
1675                                 if n <= 0:
1676                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1677                                         return
1678                                 elif n > self._max_youtube_results:
1679                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1680                                         n = self._max_youtube_results
1681                                 self._download_n_results(query, n)
1682                                 return
1683                         except ValueError: # parsing prefix as integer fails
1684                                 self._download_n_results(query, 1)
1685                                 return
1686
1687         def _download_n_results(self, query, n):
1688                 """Downloads a specified number of results for a query"""
1689
1690                 video_ids = []
1691                 already_seen = set()
1692                 pagenum = 1
1693
1694                 while True:
1695                         self.report_download_page(query, pagenum)
1696                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1697                         request = urllib2.Request(result_url, None, std_headers)
1698                         try:
1699                                 page = urllib2.urlopen(request).read()
1700                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1701                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1702                                 return
1703
1704                         # Extract video identifiers
1705                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1706                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1707                                 if video_id not in already_seen:
1708                                         video_ids.append(video_id)
1709                                         already_seen.add(video_id)
1710                                         if len(video_ids) == n:
1711                                                 # Specified n videos reached
1712                                                 for id in video_ids:
1713                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1714                                                 return
1715
1716                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1717                                 for id in video_ids:
1718                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1719                                 return
1720
1721                         pagenum = pagenum + 1
1722
1723 class GoogleSearchIE(InfoExtractor):
1724         """Information Extractor for Google Video search queries."""
1725         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1726         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1727         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1728         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1729         _google_ie = None
1730         _max_google_results = 1000
1731
1732         def __init__(self, google_ie, downloader=None):
1733                 InfoExtractor.__init__(self, downloader)
1734                 self._google_ie = google_ie
1735         
1736         @staticmethod
1737         def suitable(url):
1738                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1739
1740         def report_download_page(self, query, pagenum):
1741                 """Report attempt to download playlist page with given number."""
1742                 query = query.decode(preferredencoding())
1743                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1744
1745         def _real_initialize(self):
1746                 self._google_ie.initialize()
1747         
1748         def _real_extract(self, query):
1749                 mobj = re.match(self._VALID_QUERY, query)
1750                 if mobj is None:
1751                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1752                         return
1753
1754                 prefix, query = query.split(':')
1755                 prefix = prefix[8:]
1756                 query  = query.encode('utf-8')
1757                 if prefix == '':
1758                         self._download_n_results(query, 1)
1759                         return
1760                 elif prefix == 'all':
1761                         self._download_n_results(query, self._max_google_results)
1762                         return
1763                 else:
1764                         try:
1765                                 n = long(prefix)
1766                                 if n <= 0:
1767                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1768                                         return
1769                                 elif n > self._max_google_results:
1770                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1771                                         n = self._max_google_results
1772                                 self._download_n_results(query, n)
1773                                 return
1774                         except ValueError: # parsing prefix as integer fails
1775                                 self._download_n_results(query, 1)
1776                                 return
1777
1778         def _download_n_results(self, query, n):
1779                 """Downloads a specified number of results for a query"""
1780
1781                 video_ids = []
1782                 already_seen = set()
1783                 pagenum = 1
1784
1785                 while True:
1786                         self.report_download_page(query, pagenum)
1787                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1788                         request = urllib2.Request(result_url, None, std_headers)
1789                         try:
1790                                 page = urllib2.urlopen(request).read()
1791                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1792                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1793                                 return
1794
1795                         # Extract video identifiers
1796                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1797                                 video_id = mobj.group(1)
1798                                 if video_id not in already_seen:
1799                                         video_ids.append(video_id)
1800                                         already_seen.add(video_id)
1801                                         if len(video_ids) == n:
1802                                                 # Specified n videos reached
1803                                                 for id in video_ids:
1804                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1805                                                 return
1806
1807                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1808                                 for id in video_ids:
1809                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1810                                 return
1811
1812                         pagenum = pagenum + 1
1813
1814 class YahooSearchIE(InfoExtractor):
1815         """Information Extractor for Yahoo! Video search queries."""
1816         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1817         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1818         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1819         _MORE_PAGES_INDICATOR = r'\s*Next'
1820         _yahoo_ie = None
1821         _max_yahoo_results = 1000
1822
1823         def __init__(self, yahoo_ie, downloader=None):
1824                 InfoExtractor.__init__(self, downloader)
1825                 self._yahoo_ie = yahoo_ie
1826         
1827         @staticmethod
1828         def suitable(url):
1829                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1830
1831         def report_download_page(self, query, pagenum):
1832                 """Report attempt to download playlist page with given number."""
1833                 query = query.decode(preferredencoding())
1834                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1835
1836         def _real_initialize(self):
1837                 self._yahoo_ie.initialize()
1838         
1839         def _real_extract(self, query):
1840                 mobj = re.match(self._VALID_QUERY, query)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1843                         return
1844
1845                 prefix, query = query.split(':')
1846                 prefix = prefix[8:]
1847                 query  = query.encode('utf-8')
1848                 if prefix == '':
1849                         self._download_n_results(query, 1)
1850                         return
1851                 elif prefix == 'all':
1852                         self._download_n_results(query, self._max_yahoo_results)
1853                         return
1854                 else:
1855                         try:
1856                                 n = long(prefix)
1857                                 if n <= 0:
1858                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1859                                         return
1860                                 elif n > self._max_yahoo_results:
1861                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1862                                         n = self._max_yahoo_results
1863                                 self._download_n_results(query, n)
1864                                 return
1865                         except ValueError: # parsing prefix as integer fails
1866                                 self._download_n_results(query, 1)
1867                                 return
1868
1869         def _download_n_results(self, query, n):
1870                 """Downloads a specified number of results for a query"""
1871
1872                 video_ids = []
1873                 already_seen = set()
1874                 pagenum = 1
1875
1876                 while True:
1877                         self.report_download_page(query, pagenum)
1878                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1879                         request = urllib2.Request(result_url, None, std_headers)
1880                         try:
1881                                 page = urllib2.urlopen(request).read()
1882                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1884                                 return
1885
1886                         # Extract video identifiers
1887                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1888                                 video_id = mobj.group(1)
1889                                 if video_id not in already_seen:
1890                                         video_ids.append(video_id)
1891                                         already_seen.add(video_id)
1892                                         if len(video_ids) == n:
1893                                                 # Specified n videos reached
1894                                                 for id in video_ids:
1895                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1896                                                 return
1897
1898                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1899                                 for id in video_ids:
1900                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1901                                 return
1902
1903                         pagenum = pagenum + 1
1904
1905 class YoutubePlaylistIE(InfoExtractor):
1906         """Information Extractor for YouTube playlists."""
1907
1908         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1909         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1910         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1911         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1912         _youtube_ie = None
1913
1914         def __init__(self, youtube_ie, downloader=None):
1915                 InfoExtractor.__init__(self, downloader)
1916                 self._youtube_ie = youtube_ie
1917         
1918         @staticmethod
1919         def suitable(url):
1920                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1921
1922         def report_download_page(self, playlist_id, pagenum):
1923                 """Report attempt to download playlist page with given number."""
1924                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1925
1926         def _real_initialize(self):
1927                 self._youtube_ie.initialize()
1928         
1929         def _real_extract(self, url):
1930                 # Extract playlist id
1931                 mobj = re.match(self._VALID_URL, url)
1932                 if mobj is None:
1933                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1934                         return
1935
1936                 # Download playlist pages
1937                 playlist_id = mobj.group(1)
1938                 video_ids = []
1939                 pagenum = 1
1940
1941                 while True:
1942                         self.report_download_page(playlist_id, pagenum)
1943                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1944                         try:
1945                                 page = urllib2.urlopen(request).read()
1946                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1947                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1948                                 return
1949
1950                         # Extract video identifiers
1951                         ids_in_page = []
1952                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1953                                 if mobj.group(1) not in ids_in_page:
1954                                         ids_in_page.append(mobj.group(1))
1955                         video_ids.extend(ids_in_page)
1956
1957                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1958                                 break
1959                         pagenum = pagenum + 1
1960
1961                 for id in video_ids:
1962                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1963                 return
1964
1965 class YoutubeUserIE(InfoExtractor):
1966         """Information Extractor for YouTube users."""
1967
1968         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1969         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1970         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1971         _youtube_ie = None
1972
1973         def __init__(self, youtube_ie, downloader=None):
1974                 InfoExtractor.__init__(self, downloader)
1975                 self._youtube_ie = youtube_ie
1976         
1977         @staticmethod
1978         def suitable(url):
1979                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1980
1981         def report_download_page(self, username):
1982                 """Report attempt to download user page."""
1983                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1984
1985         def _real_initialize(self):
1986                 self._youtube_ie.initialize()
1987         
1988         def _real_extract(self, url):
1989                 # Extract username
1990                 mobj = re.match(self._VALID_URL, url)
1991                 if mobj is None:
1992                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1993                         return
1994
1995                 # Download user page
1996                 username = mobj.group(1)
1997                 video_ids = []
1998                 pagenum = 1
1999
2000                 self.report_download_page(username)
2001                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2002                 try:
2003                         page = urllib2.urlopen(request).read()
2004                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2005                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2006                         return
2007
2008                 # Extract video identifiers
2009                 ids_in_page = []
2010
2011                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2012                         if mobj.group(1) not in ids_in_page:
2013                                 ids_in_page.append(mobj.group(1))
2014                 video_ids.extend(ids_in_page)
2015
2016                 for id in video_ids:
2017                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2018                 return
2019
class PostProcessor(object):
	"""Base class for postprocessing steps.

	Instances are registered on a FileDownloader through its
	add_post_processor() method. After each successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one: first with an initial information dictionary, then feeding the
	value returned by one processor to the next.

	The chain stops as soon as a processor returns None, or when the
	last processor has run.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" scheme with their downloader.
	"""

	# Downloader this processor is attached to (set via the constructor
	# or set_downloader()).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process a finished download.

		"information" is a dictionary shaped like the ones composed by
		InfoExtractors, extended with a "filepath" key naming the file
		that was just downloaded.

		Return None to stop the postprocessing chain, or an information
		dictionary -- possibly the one received, with some fields
		changed -- to pass along to the next processor in the chain.
		May also raise a PostProcessingError, which the calling
		downloader takes into account.
		"""
		return information  # default behaviour: pass through unchanged
2065         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			"""Overwrite this script in place with the latest released version.

			downloader is only used for its to_stdout() reporting;
			filename is the path of the running script (sys.argv[0]).
			Exits the process if the file is not writable.
			"""
			# Note: downloader only used for options
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener() call replaces the
		# first opener; build_opener() installs a default ProxyHandler
		# anyway, so proxy support appears preserved -- confirm before
		# simplifying to a single call.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.07.22',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')

		# Information extractors
		# Search, playlist and user extractors delegate the per-video
		# work to the basic extractor they are constructed with.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# Output template precedence: an explicit -o template wins;
			# otherwise one is derived from --all-formats / -t / -l,
			# falling back to the plain '%(id)s.%(ext)s' form.
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		# Exit status is the downloader's return code (non-zero if any
		# download failed).
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')