727c331f60b2629ba4984ec6694b6a1f3dd3cd4c
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25         from urlparse import parse_qs
26 except ImportError:
27         from cgi import parse_qs
28
# HTTP headers sent with every request, imitating a contemporary Firefox
# build so video sites serve the same pages they would serve to a browser.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe in "simplified" titles; decoded to unicode so
# the value can be interpolated into unicode regular expressions.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks.
        """
        # The original wrapped this in a one-shot generator consumed with the
        # Python-2-only .next() method; a plain try/except gives the same
        # result. The bare "except:" is narrowed to Exception so that
        # SystemExit/KeyboardInterrupt are not swallowed.
        try:
                pref = locale.getpreferredencoding()
                # Probe the returned name: locale may report a codec Python
                # cannot actually encode with.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                # The special name u'-' means standard output; no file opened.
                if filename == u'-':
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
107
108
class DownloadError(Exception):
        """Raised when a download fails and errors are not being ignored.

        FileDownloader objects throw this exception when they are not
        configured to continue on errors; it carries the relevant error
        message.
        """
        pass
117
class SameFileError(Exception):
        """Raised when several downloads would collide on one disk file.

        FileDownloader objects throw this when they detect that multiple
        files would have to be written to the same path on disk.
        """
        pass
125
class PostProcessingError(Exception):
        """Raised by a PostProcessor's .run() method on failure.

        Indicates that an error occurred while performing the
        post-processing task.
        """
        pass
133
class UnavailableVideoError(Exception):
        """Raised when a video is requested in an unavailable format.

        Thrown when the format asked for does not exist for the video in
        question.
        """
        pass
141
class ContentTooShortError(Exception):
        """Raised when served data is smaller than the server announced.

        FileDownloader objects raise this when the number of bytes actually
        downloaded falls short of the announced Content-Length, which usually
        means the connection was interrupted.
        """
        # Byte counts: how much arrived vs. how much was announced.
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.downloaded, self.expected = downloaded, expected
156
157 class FileDownloader(object):
158         """File Downloader class.
159
160         File downloader objects are the ones responsible of downloading the
161         actual video file and writing it to disk if the user has requested
162         it, among some other tasks. In most cases there should be one per
163         program. As, given a video URL, the downloader doesn't know how to
164         extract all the needed information, task that InfoExtractors do, it
165         has to pass the URL to one of them.
166
167         For this, file downloader objects have a method that allows
168         InfoExtractors to be registered in a given order. When it is passed
169         a URL, the file downloader handles it to the first InfoExtractor it
170         finds that reports being able to handle it. The InfoExtractor extracts
171         all the information about the video or videos the URL refers to, and
172         asks the FileDownloader to process the video information, possibly
173         downloading the video.
174
175         File downloaders accept a lot of parameters. In order not to saturate
176         the object constructor with arguments, it receives a dictionary of
177         options instead. These options are available through the params
178         attribute for the InfoExtractors to use. The FileDownloader also
179         registers itself as the downloader in charge for the InfoExtractors
180         that are added to it, so this is a "mutual registration".
181
182         Available options:
183
184         username:       Username for authentication purposes.
185         password:       Password for authentication purposes.
186         usenetrc:       Use netrc for authentication instead.
187         quiet:          Do not print messages to stdout.
188         forceurl:       Force printing final URL.
189         forcetitle:     Force printing title.
190         simulate:       Do not download the video files.
191         format:         Video format code.
192         format_limit:   Highest quality format to try.
193         outtmpl:        Template for output names.
194         ignoreerrors:   Do not stop on download errors.
195         ratelimit:      Download speed limit, in bytes/sec.
196         nooverwrites:   Prevent overwriting files.
197         retries:        Number of times to retry for HTTP error 503
198         continuedl:     Try to continue downloads if possible.
199         noprogress:     Do not print the progress bar.
200         """
201
202         params = None
203         _ies = []
204         _pps = []
205         _download_retcode = None
206         _num_downloads = None
207
208         def __init__(self, params):
209                 """Create a FileDownloader object with the given options."""
210                 self._ies = []
211                 self._pps = []
212                 self._download_retcode = 0
213                 self._num_downloads = 0
214                 self.params = params
215         
216         @staticmethod
217         def pmkdir(filename):
218                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219                 components = filename.split(os.sep)
220                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
221                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
222                 for dir in aggregate:
223                         if not os.path.exists(dir):
224                                 os.mkdir(dir)
225         
226         @staticmethod
227         def format_bytes(bytes):
228                 if bytes is None:
229                         return 'N/A'
230                 if type(bytes) is str:
231                         bytes = float(bytes)
232                 if bytes == 0.0:
233                         exponent = 0
234                 else:
235                         exponent = long(math.log(bytes, 1024.0))
236                 suffix = 'bkMGTPEZY'[exponent]
237                 converted = float(bytes) / float(1024**exponent)
238                 return '%.2f%s' % (converted, suffix)
239
240         @staticmethod
241         def calc_percent(byte_counter, data_len):
242                 if data_len is None:
243                         return '---.-%'
244                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
245
246         @staticmethod
247         def calc_eta(start, now, total, current):
248                 if total is None:
249                         return '--:--'
250                 dif = now - start
251                 if current == 0 or dif < 0.001: # One millisecond
252                         return '--:--'
253                 rate = float(current) / dif
254                 eta = long((float(total) - float(current)) / rate)
255                 (eta_mins, eta_secs) = divmod(eta, 60)
256                 if eta_mins > 99:
257                         return '--:--'
258                 return '%02d:%02d' % (eta_mins, eta_secs)
259
260         @staticmethod
261         def calc_speed(start, now, bytes):
262                 dif = now - start
263                 if bytes == 0 or dif < 0.001: # One millisecond
264                         return '%10s' % '---b/s'
265                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
266
267         @staticmethod
268         def best_block_size(elapsed_time, bytes):
269                 new_min = max(bytes / 2.0, 1.0)
270                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271                 if elapsed_time < 0.001:
272                         return long(new_max)
273                 rate = bytes / elapsed_time
274                 if rate > new_max:
275                         return long(new_max)
276                 if rate < new_min:
277                         return long(new_min)
278                 return long(rate)
279
280         @staticmethod
281         def parse_bytes(bytestr):
282                 """Parse a string indicating a byte quantity into a long integer."""
283                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
284                 if matchobj is None:
285                         return None
286                 number = float(matchobj.group(1))
287                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
288                 return long(round(number * multiplier))
289
290         def add_info_extractor(self, ie):
291                 """Add an InfoExtractor object to the end of the list."""
292                 self._ies.append(ie)
293                 ie.set_downloader(self)
294         
295         def add_post_processor(self, pp):
296                 """Add a PostProcessor object to the end of the chain."""
297                 self._pps.append(pp)
298                 pp.set_downloader(self)
299         
300         def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
301                 """Print message to stdout if not in quiet mode."""
302                 try:
303                         if not self.params.get('quiet', False):
304                                 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
305                         sys.stdout.flush()
306                 except (UnicodeEncodeError), err:
307                         if not ignore_encoding_errors:
308                                 raise
309         
310         def to_stderr(self, message):
311                 """Print message to stderr."""
312                 print >>sys.stderr, message.encode(preferredencoding())
313         
314         def fixed_template(self):
315                 """Checks if the output template is fixed."""
316                 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
317
318         def trouble(self, message=None):
319                 """Determine action to take when a download problem appears.
320
321                 Depending on if the downloader has been configured to ignore
322                 download errors or not, this method may throw an exception or
323                 not when errors are found, after printing the message.
324                 """
325                 if message is not None:
326                         self.to_stderr(message)
327                 if not self.params.get('ignoreerrors', False):
328                         raise DownloadError(message)
329                 self._download_retcode = 1
330
331         def slow_down(self, start_time, byte_counter):
332                 """Sleep if the download speed is over the rate limit."""
333                 rate_limit = self.params.get('ratelimit', None)
334                 if rate_limit is None or byte_counter == 0:
335                         return
336                 now = time.time()
337                 elapsed = now - start_time
338                 if elapsed <= 0.0:
339                         return
340                 speed = float(byte_counter) / elapsed
341                 if speed > rate_limit:
342                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
343
344         def report_destination(self, filename):
345                 """Report destination filename."""
346                 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
347         
348         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
349                 """Report download progress."""
350                 if self.params.get('noprogress', False):
351                         return
352                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
353                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
354
355         def report_resuming_byte(self, resume_len):
356                 """Report attemtp to resume at given byte."""
357                 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
358         
359         def report_retry(self, count, retries):
360                 """Report retry in case of HTTP error 503"""
361                 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
362         
363         def report_file_already_downloaded(self, file_name):
364                 """Report file has already been fully downloaded."""
365                 try:
366                         self.to_stdout(u'[download] %s has already been downloaded' % file_name)
367                 except (UnicodeEncodeError), err:
368                         self.to_stdout(u'[download] The file has already been downloaded')
369         
370         def report_unable_to_resume(self):
371                 """Report it was impossible to resume download."""
372                 self.to_stdout(u'[download] Unable to resume')
373         
374         def report_finish(self):
375                 """Report download finished."""
376                 if self.params.get('noprogress', False):
377                         self.to_stdout(u'[download] Download completed')
378                 else:
379                         self.to_stdout(u'')
380         
381         def increment_downloads(self):
382                 """Increment the ordinal that assigns a number to each file."""
383                 self._num_downloads += 1
384
385         def process_info(self, info_dict):
386                 """Process a single dictionary returned by an InfoExtractor."""
387                 # Do nothing else if in simulate mode
388                 if self.params.get('simulate', False):
389                         # Forced printings
390                         if self.params.get('forcetitle', False):
391                                 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
392                         if self.params.get('forceurl', False):
393                                 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
394                         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
395                                 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
396                         if self.params.get('forcedescription', False) and 'description' in info_dict:
397                                 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
398
399                         return
400                         
401                 try:
402                         template_dict = dict(info_dict)
403                         template_dict['epoch'] = unicode(long(time.time()))
404                         template_dict['ord'] = unicode('%05d' % self._num_downloads)
405                         filename = self.params['outtmpl'] % template_dict
406                 except (ValueError, KeyError), err:
407                         self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
408                 if self.params.get('nooverwrites', False) and os.path.exists(filename):
409                         self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
410                         return
411
412                 try:
413                         self.pmkdir(filename)
414                 except (OSError, IOError), err:
415                         self.trouble('ERROR: unable to create directories: %s' % str(err))
416                         return
417
418                 try:
419                         success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
420                 except (OSError, IOError), err:
421                         raise UnavailableVideoError
422                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
423                         self.trouble('ERROR: unable to download video data: %s' % str(err))
424                         return
425                 except (ContentTooShortError, ), err:
426                         self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
427                         return
428
429                 if success:
430                         try:
431                                 self.post_process(filename, info_dict)
432                         except (PostProcessingError), err:
433                                 self.trouble('ERROR: postprocessing: %s' % str(err))
434                                 return
435
436         def download(self, url_list):
437                 """Download a given list of URLs."""
438                 if len(url_list) > 1 and self.fixed_template():
439                         raise SameFileError(self.params['outtmpl'])
440
441                 for url in url_list:
442                         suitable_found = False
443                         for ie in self._ies:
444                                 # Go to next InfoExtractor if not suitable
445                                 if not ie.suitable(url):
446                                         continue
447
448                                 # Suitable InfoExtractor found
449                                 suitable_found = True
450
451                                 # Extract information from URL and process it
452                                 ie.extract(url)
453
454                                 # Suitable InfoExtractor had been found; go to next URL
455                                 break
456
457                         if not suitable_found:
458                                 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
459
460                 return self._download_retcode
461
462         def post_process(self, filename, ie_info):
463                 """Run the postprocessing chain on the given file."""
464                 info = dict(ie_info)
465                 info['filepath'] = filename
466                 for pp in self._pps:
467                         info = pp.run(info)
468                         if info is None:
469                                 break
470         
471         def _download_with_rtmpdump(self, filename, url, player_url):
472                 self.report_destination(filename)
473
474                 # Check for rtmpdump first
475                 try:
476                         subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
477                 except (OSError, IOError):
478                         self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
479                         return False
480
481                 # Download using rtmpdump. rtmpdump returns exit code 2 when
482                 # the connection was interrumpted and resuming appears to be
483                 # possible. This is part of rtmpdump's normal usage, AFAIK.
484                 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
485                 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
486                 while retval == 2 or retval == 1:
487                         prevsize = os.path.getsize(filename)
488                         self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
489                         time.sleep(5.0) # This seems to be needed
490                         retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
491                         cursize = os.path.getsize(filename)
492                         if prevsize == cursize and retval == 1:
493                                 break
494                 if retval == 0:
495                         self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
496                         return True
497                 else:
498                         self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
499                         return False
500
501         def _do_download(self, filename, url, player_url):
502                 # Attempt to download using rtmpdump
503                 if url.startswith('rtmp'):
504                         return self._download_with_rtmpdump(filename, url, player_url)
505
506                 stream = None
507                 open_mode = 'wb'
508                 basic_request = urllib2.Request(url, None, std_headers)
509                 request = urllib2.Request(url, None, std_headers)
510
511                 # Establish possible resume length
512                 if os.path.isfile(filename):
513                         resume_len = os.path.getsize(filename)
514                 else:
515                         resume_len = 0
516
517                 # Request parameters in case of being able to resume
518                 if self.params.get('continuedl', False) and resume_len != 0:
519                         self.report_resuming_byte(resume_len)
520                         request.add_header('Range','bytes=%d-' % resume_len)
521                         open_mode = 'ab'
522
523                 count = 0
524                 retries = self.params.get('retries', 0)
525                 while count <= retries:
526                         # Establish connection
527                         try:
528                                 data = urllib2.urlopen(request)
529                                 break
530                         except (urllib2.HTTPError, ), err:
531                                 if err.code != 503 and err.code != 416:
532                                         # Unexpected HTTP error
533                                         raise
534                                 elif err.code == 416:
535                                         # Unable to resume (requested range not satisfiable)
536                                         try:
537                                                 # Open the connection again without the range header
538                                                 data = urllib2.urlopen(basic_request)
539                                                 content_length = data.info()['Content-Length']
540                                         except (urllib2.HTTPError, ), err:
541                                                 if err.code != 503:
542                                                         raise
543                                         else:
544                                                 # Examine the reported length
545                                                 if (content_length is not None and
546                                                     (resume_len - 100 < long(content_length) < resume_len + 100)):
547                                                         # The file had already been fully downloaded.
548                                                         # Explanation to the above condition: in issue #175 it was revealed that
549                                                         # YouTube sometimes adds or removes a few bytes from the end of the file,
550                                                         # changing the file size slightly and causing problems for some users. So
551                                                         # I decided to implement a suggested change and consider the file
552                                                         # completely downloaded if the file size differs less than 100 bytes from
553                                                         # the one in the hard drive.
554                                                         self.report_file_already_downloaded(filename)
555                                                         return True
556                                                 else:
557                                                         # The length does not match, we start the download over
558                                                         self.report_unable_to_resume()
559                                                         open_mode = 'wb'
560                                                         break
561                         # Retry
562                         count += 1
563                         if count <= retries:
564                                 self.report_retry(count, retries)
565
566                 if count > retries:
567                         self.trouble(u'ERROR: giving up after %s retries' % retries)
568                         return False
569
570                 data_len = data.info().get('Content-length', None)
571                 data_len_str = self.format_bytes(data_len)
572                 byte_counter = 0
573                 block_size = 1024
574                 start = time.time()
575                 while True:
576                         # Download and write
577                         before = time.time()
578                         data_block = data.read(block_size)
579                         after = time.time()
580                         data_block_len = len(data_block)
581                         if data_block_len == 0:
582                                 break
583                         byte_counter += data_block_len
584
585                         # Open file just in time
586                         if stream is None:
587                                 try:
588                                         (stream, filename) = sanitize_open(filename, open_mode)
589                                         self.report_destination(filename)
590                                 except (OSError, IOError), err:
591                                         self.trouble('ERROR: unable to open for writing: %s' % str(err))
592                                         return False
593                         try:
594                                 stream.write(data_block)
595                         except (IOError, OSError), err:
596                                 self.trouble('\nERROR: unable to write data: %s' % str(err))
597                         block_size = self.best_block_size(after - before, data_block_len)
598
599                         # Progress message
600                         percent_str = self.calc_percent(byte_counter, data_len)
601                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
602                         speed_str = self.calc_speed(start, time.time(), byte_counter)
603                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
604
605                         # Apply rate limit
606                         self.slow_down(start, byte_counter)
607
608                 self.report_finish()
609                 if data_len is not None and str(byte_counter) != data_len:
610                         raise ContentTooShortError(byte_counter, long(data_len))
611                 return True
612
class InfoExtractor(object):
        """Information Extractor class.

        Given a URL, an information extractor produces a dictionary
        describing the video (or videos) that URL refers to: the real video
        URL, the literal and simplified titles, the uploader and so on. The
        FileDownloader consumes that dictionary and takes care of the rest,
        possibly writing the video to the file system. Each dictionary must
        include the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional. Their primary purpose is to allow
        youtube-dl to serve as the backend for a video search function, such
        as the one in youtube2mp3.  They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method,
        and will normally be instantiated and registered with the main
        downloader.
        """

        # Whether _real_initialize() has already run for this instance.
        _ready = False
        # The FileDownloader this extractor reports to (may be None).
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
683
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# group(1) is the optional scheme/host prefix; group(2) is the video id.
	# The trailing (?(1).+)? is a conditional pattern: extra characters are
	# only allowed after the id when a site prefix actually matched.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Maps fmt codes to file extensions; codes not listed here fall back
	# to 'flv' in _real_extract.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language and, when credentials are available,
		log in and confirm age.

		Credentials come either from the downloader params
		('username'/'password') or, with 'usenetrc', from ~/.netrc.
		Failures are reported as warnings/errors and abort initialization.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Download the watch page and get_video_info data for the URL,
		then feed one info dict per selected format to the downloader."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		# group(1) is the optional URL prefix, group(2) the id itself.
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info
		# Try several '&el=' variants of get_video_info until one of them
		# yields a 'token' parameter.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse runs of non-alphanumerics to '_'
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description (only extracted when forced printing requests it)
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		# '%%s' leaves a '%s' placeholder for the fmt code filled in below.
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'fmt|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			# 'format_limit' caps quality: keep only formats at or below it
			# in the quality-ordered _available_formats list.
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					# pre-2.5 'and/or' conditional idiom; safe here because
					# u'NA' is truthy.
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
947
948
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	# group(1) is the video id, group(2) the simplified title slug.
	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# YoutubeIE instance used to delegate 'yt-' prefixed videos.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		"""Constructor. Receives a YoutubeIE for delegation and an
		optional downloader."""
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the family-filter disclaimer page, then POST the
		age-confirmation form so later requests are unfiltered."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER, None, std_headers)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract media URL, title and uploader from a metacafe watch
		page and hand the info dict to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		# Metacafe mirrors YouTube videos under ids of the form 'yt-<id>';
		# those are delegated to the YouTube extractor.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		# NOTE(review): unlike the other requests in this file, std_headers
		# is not passed here — confirm whether that is intentional.
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		# Extract gdaKey if available
		# When present, the key must be appended as '__gda__' for the
		# media URL to work; absence is tolerated (best effort).
		mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
		if mobj is None:
			video_url = mediaURL
			#self._downloader.trouble(u'ERROR: unable to extract gdaKey')
			#return
		else:
			gdaKey = mobj.group(1)
			video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'ERROR: unable to download video')
1079
1080
1081 class DailymotionIE(InfoExtractor):
1082         """Information Extractor for Dailymotion"""
1083
1084         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1085
1086         def __init__(self, downloader=None):
1087                 InfoExtractor.__init__(self, downloader)
1088
1089         @staticmethod
1090         def suitable(url):
1091                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1092
1093         def report_download_webpage(self, video_id):
1094                 """Report webpage download."""
1095                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1096         
1097         def report_extraction(self, video_id):
1098                 """Report information extraction."""
1099                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1100
1101         def _real_initialize(self):
1102                 return
1103
1104         def _real_extract(self, url):
1105                 # Extract id and simplified title from URL
1106                 mobj = re.match(self._VALID_URL, url)
1107                 if mobj is None:
1108                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1109                         return
1110
1111                 # At this point we have a new video
1112                 self._downloader.increment_downloads()
1113                 video_id = mobj.group(1)
1114
1115                 simple_title = mobj.group(2).decode('utf-8')
1116                 video_extension = 'flv'
1117
1118                 # Retrieve video webpage to extract further information
1119                 request = urllib2.Request(url)
1120                 try:
1121                         self.report_download_webpage(video_id)
1122                         webpage = urllib2.urlopen(request).read()
1123                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1124                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1125                         return
1126
1127                 # Extract URL, uploader and title from webpage
1128                 self.report_extraction(video_id)
1129                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1130                 if mobj is None:
1131                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1132                         return
1133                 mediaURL = urllib.unquote(mobj.group(1))
1134
1135                 # if needed add http://www.dailymotion.com/ if relative URL
1136
1137                 video_url = mediaURL
1138
1139                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1140                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1141                 if mobj is None:
1142                         self._downloader.trouble(u'ERROR: unable to extract title')
1143                         return
1144                 video_title = mobj.group(1).decode('utf-8')
1145                 video_title = sanitize_title(video_title)
1146
1147                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1148                 if mobj is None:
1149                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1150                         return
1151                 video_uploader = mobj.group(1)
1152
1153                 try:
1154                         # Process video information
1155                         self._downloader.process_info({
1156                                 'id':           video_id.decode('utf-8'),
1157                                 'url':          video_url.decode('utf-8'),
1158                                 'uploader':     video_uploader.decode('utf-8'),
1159                                 'title':        video_title,
1160                                 'stitle':       simple_title,
1161                                 'ext':          video_extension.decode('utf-8'),
1162                                 'format':       u'NA',
1163                                 'player_url':   None,
1164                         })
1165                 except UnavailableVideoError:
1166                         self._downloader.trouble(u'ERROR: unable to download video')
1167
1168 class GoogleIE(InfoExtractor):
1169         """Information extractor for video.google.com."""
1170
1171         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1172
1173         def __init__(self, downloader=None):
1174                 InfoExtractor.__init__(self, downloader)
1175
1176         @staticmethod
1177         def suitable(url):
1178                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1179
1180         def report_download_webpage(self, video_id):
1181                 """Report webpage download."""
1182                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1183
1184         def report_extraction(self, video_id):
1185                 """Report information extraction."""
1186                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1187
1188         def _real_initialize(self):
1189                 return
1190
1191         def _real_extract(self, url):
1192                 # Extract id from URL
1193                 mobj = re.match(self._VALID_URL, url)
1194                 if mobj is None:
1195                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196                         return
1197
1198                 # At this point we have a new video
1199                 self._downloader.increment_downloads()
1200                 video_id = mobj.group(1)
1201
1202                 video_extension = 'mp4'
1203
1204                 # Retrieve video webpage to extract further information
1205                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1206                 try:
1207                         self.report_download_webpage(video_id)
1208                         webpage = urllib2.urlopen(request).read()
1209                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1210                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1211                         return
1212
1213                 # Extract URL, uploader, and title from webpage
1214                 self.report_extraction(video_id)
1215                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1216                 if mobj is None:
1217                         video_extension = 'flv'
1218                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1219                 if mobj is None:
1220                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1221                         return
1222                 mediaURL = urllib.unquote(mobj.group(1))
1223                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1224                 mediaURL = mediaURL.replace('\\x26', '\x26')
1225
1226                 video_url = mediaURL
1227
1228                 mobj = re.search(r'<title>(.*)</title>', webpage)
1229                 if mobj is None:
1230                         self._downloader.trouble(u'ERROR: unable to extract title')
1231                         return
1232                 video_title = mobj.group(1).decode('utf-8')
1233                 video_title = sanitize_title(video_title)
1234                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1235
1236                 # Extract video description
1237                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1238                 if mobj is None:
1239                         self._downloader.trouble(u'ERROR: unable to extract video description')
1240                         return
1241                 video_description = mobj.group(1).decode('utf-8')
1242                 if not video_description:
1243                         video_description = 'No description available.'
1244
1245                 # Extract video thumbnail
1246                 if self._downloader.params.get('forcethumbnail', False):
1247                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1248                         try:
1249                                 webpage = urllib2.urlopen(request).read()
1250                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1251                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1252                                 return
1253                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1254                         if mobj is None:
1255                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1256                                 return
1257                         video_thumbnail = mobj.group(1)
1258                 else:   # we need something to pass to process_info
1259                         video_thumbnail = ''
1260
1261
1262                 try:
1263                         # Process video information
1264                         self._downloader.process_info({
1265                                 'id':           video_id.decode('utf-8'),
1266                                 'url':          video_url.decode('utf-8'),
1267                                 'uploader':     u'NA',
1268                                 'title':        video_title,
1269                                 'stitle':       simple_title,
1270                                 'ext':          video_extension.decode('utf-8'),
1271                                 'format':       u'NA',
1272                                 'player_url':   None,
1273                         })
1274                 except UnavailableVideoError:
1275                         self._downloader.trouble(u'ERROR: unable to download video')
1276
1277
1278 class PhotobucketIE(InfoExtractor):
1279         """Information extractor for photobucket.com."""
1280
1281         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1282
1283         def __init__(self, downloader=None):
1284                 InfoExtractor.__init__(self, downloader)
1285
1286         @staticmethod
1287         def suitable(url):
1288                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1289
1290         def report_download_webpage(self, video_id):
1291                 """Report webpage download."""
1292                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1293
1294         def report_extraction(self, video_id):
1295                 """Report information extraction."""
1296                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1297
1298         def _real_initialize(self):
1299                 return
1300
1301         def _real_extract(self, url):
1302                 # Extract id from URL
1303                 mobj = re.match(self._VALID_URL, url)
1304                 if mobj is None:
1305                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1306                         return
1307
1308                 # At this point we have a new video
1309                 self._downloader.increment_downloads()
1310                 video_id = mobj.group(1)
1311
1312                 video_extension = 'flv'
1313
1314                 # Retrieve video webpage to extract further information
1315                 request = urllib2.Request(url)
1316                 try:
1317                         self.report_download_webpage(video_id)
1318                         webpage = urllib2.urlopen(request).read()
1319                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1321                         return
1322
1323                 # Extract URL, uploader, and title from webpage
1324                 self.report_extraction(video_id)
1325                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1326                 if mobj is None:
1327                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1328                         return
1329                 mediaURL = urllib.unquote(mobj.group(1))
1330
1331                 video_url = mediaURL
1332
1333                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1334                 if mobj is None:
1335                         self._downloader.trouble(u'ERROR: unable to extract title')
1336                         return
1337                 video_title = mobj.group(1).decode('utf-8')
1338                 video_title = sanitize_title(video_title)
1339                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1340
1341                 video_uploader = mobj.group(2).decode('utf-8')
1342
1343                 try:
1344                         # Process video information
1345                         self._downloader.process_info({
1346                                 'id':           video_id.decode('utf-8'),
1347                                 'url':          video_url.decode('utf-8'),
1348                                 'uploader':     video_uploader,
1349                                 'title':        video_title,
1350                                 'stitle':       simple_title,
1351                                 'ext':          video_extension.decode('utf-8'),
1352                                 'format':       u'NA',
1353                                 'player_url':   None,
1354                         })
1355                 except UnavailableVideoError:
1356                         self._downloader.trouble(u'ERROR: unable to download video')
1357
1358
1359 class YahooIE(InfoExtractor):
1360         """Information extractor for video.yahoo.com."""
1361
1362         # _VALID_URL matches all Yahoo! Video URLs
1363         # _VPAGE_URL matches only the extractable '/watch/' URLs
1364         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1365         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1366
1367         def __init__(self, downloader=None):
1368                 InfoExtractor.__init__(self, downloader)
1369
1370         @staticmethod
1371         def suitable(url):
1372                 return (re.match(YahooIE._VALID_URL, url) is not None)
1373
1374         def report_download_webpage(self, video_id):
1375                 """Report webpage download."""
1376                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1377
1378         def report_extraction(self, video_id):
1379                 """Report information extraction."""
1380                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1381
1382         def _real_initialize(self):
1383                 return
1384
1385         def _real_extract(self, url, new_video=True):
1386                 # Extract ID from URL
1387                 mobj = re.match(self._VALID_URL, url)
1388                 if mobj is None:
1389                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1390                         return
1391
1392                 # At this point we have a new video
1393                 self._downloader.increment_downloads()
1394                 video_id = mobj.group(2)
1395                 video_extension = 'flv'
1396
1397                 # Rewrite valid but non-extractable URLs as
1398                 # extractable English language /watch/ URLs
1399                 if re.match(self._VPAGE_URL, url) is None:
1400                         request = urllib2.Request(url)
1401                         try:
1402                                 webpage = urllib2.urlopen(request).read()
1403                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1404                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1405                                 return
1406
1407                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1408                         if mobj is None:
1409                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1410                                 return
1411                         yahoo_id = mobj.group(1)
1412
1413                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1414                         if mobj is None:
1415                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1416                                 return
1417                         yahoo_vid = mobj.group(1)
1418
1419                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1420                         return self._real_extract(url, new_video=False)
1421
1422                 # Retrieve video webpage to extract further information
1423                 request = urllib2.Request(url)
1424                 try:
1425                         self.report_download_webpage(video_id)
1426                         webpage = urllib2.urlopen(request).read()
1427                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1428                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1429                         return
1430
1431                 # Extract uploader and title from webpage
1432                 self.report_extraction(video_id)
1433                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1434                 if mobj is None:
1435                         self._downloader.trouble(u'ERROR: unable to extract video title')
1436                         return
1437                 video_title = mobj.group(1).decode('utf-8')
1438                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1439
1440                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1441                 if mobj is None:
1442                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1443                         return
1444                 video_uploader = mobj.group(1).decode('utf-8')
1445
1446                 # Extract video thumbnail
1447                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1448                 if mobj is None:
1449                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1450                         return
1451                 video_thumbnail = mobj.group(1).decode('utf-8')
1452
1453                 # Extract video description
1454                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1455                 if mobj is None:
1456                         self._downloader.trouble(u'ERROR: unable to extract video description')
1457                         return
1458                 video_description = mobj.group(1).decode('utf-8')
1459                 if not video_description: video_description = 'No description available.'
1460
1461                 # Extract video height and width
1462                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1463                 if mobj is None:
1464                         self._downloader.trouble(u'ERROR: unable to extract video height')
1465                         return
1466                 yv_video_height = mobj.group(1)
1467
1468                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1469                 if mobj is None:
1470                         self._downloader.trouble(u'ERROR: unable to extract video width')
1471                         return
1472                 yv_video_width = mobj.group(1)
1473
1474                 # Retrieve video playlist to extract media URL
1475                 # I'm not completely sure what all these options are, but we
1476                 # seem to need most of them, otherwise the server sends a 401.
1477                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1478                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1479                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1480                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1481                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1482                 try:
1483                         self.report_download_webpage(video_id)
1484                         webpage = urllib2.urlopen(request).read()
1485                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1487                         return
1488
1489                 # Extract media URL from playlist XML
1490                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1491                 if mobj is None:
1492                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1493                         return
1494                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1495                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1496
1497                 try:
1498                         # Process video information
1499                         self._downloader.process_info({
1500                                 'id':           video_id.decode('utf-8'),
1501                                 'url':          video_url,
1502                                 'uploader':     video_uploader,
1503                                 'title':        video_title,
1504                                 'stitle':       simple_title,
1505                                 'ext':          video_extension.decode('utf-8'),
1506                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1507                                 'description':  video_description,
1508                                 'thumbnail':    video_thumbnail,
1509                                 'description':  video_description,
1510                                 'player_url':   None,
1511                         })
1512                 except UnavailableVideoError:
1513                         self._downloader.trouble(u'ERROR: unable to download video')
1514
1515
1516 class GenericIE(InfoExtractor):
1517         """Generic last-resort information extractor."""
1518
1519         def __init__(self, downloader=None):
1520                 InfoExtractor.__init__(self, downloader)
1521
1522         @staticmethod
1523         def suitable(url):
1524                 return True
1525
1526         def report_download_webpage(self, video_id):
1527                 """Report webpage download."""
1528                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1529                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1530
1531         def report_extraction(self, video_id):
1532                 """Report information extraction."""
1533                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1534
1535         def _real_initialize(self):
1536                 return
1537
1538         def _real_extract(self, url):
1539                 # At this point we have a new video
1540                 self._downloader.increment_downloads()
1541
1542                 video_id = url.split('/')[-1]
1543                 request = urllib2.Request(url)
1544                 try:
1545                         self.report_download_webpage(video_id)
1546                         webpage = urllib2.urlopen(request).read()
1547                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1549                         return
1550                 except ValueError, err:
1551                         # since this is the last-resort InfoExtractor, if
1552                         # this error is thrown, it'll be thrown here
1553                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1554                         return
1555
1556                 # Start with something easy: JW Player in SWFObject
1557                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1558                 if mobj is None:
1559                         # Broaden the search a little bit
1560                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1561                 if mobj is None:
1562                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1563                         return
1564
1565                 # It's possible that one of the regexes
1566                 # matched, but returned an empty group:
1567                 if mobj.group(1) is None:
1568                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1569                         return
1570
1571                 video_url = urllib.unquote(mobj.group(1))
1572                 video_id  = os.path.basename(video_url)
1573
1574                 # here's a fun little line of code for you:
1575                 video_extension = os.path.splitext(video_id)[1][1:]
1576                 video_id        = os.path.splitext(video_id)[0]
1577
1578                 # it's tempting to parse this further, but you would
1579                 # have to take into account all the variations like
1580                 #   Video Title - Site Name
1581                 #   Site Name | Video Title
1582                 #   Video Title - Tagline | Site Name
1583                 # and so on and so forth; it's just not practical
1584                 mobj = re.search(r'<title>(.*)</title>', webpage)
1585                 if mobj is None:
1586                         self._downloader.trouble(u'ERROR: unable to extract title')
1587                         return
1588                 video_title = mobj.group(1).decode('utf-8')
1589                 video_title = sanitize_title(video_title)
1590                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1591
1592                 # video uploader is domain name
1593                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1594                 if mobj is None:
1595                         self._downloader.trouble(u'ERROR: unable to extract title')
1596                         return
1597                 video_uploader = mobj.group(1).decode('utf-8')
1598
1599                 try:
1600                         # Process video information
1601                         self._downloader.process_info({
1602                                 'id':           video_id.decode('utf-8'),
1603                                 'url':          video_url.decode('utf-8'),
1604                                 'uploader':     video_uploader,
1605                                 'title':        video_title,
1606                                 'stitle':       simple_title,
1607                                 'ext':          video_extension.decode('utf-8'),
1608                                 'format':       u'NA',
1609                                 'player_url':   None,
1610                         })
1611                 except UnavailableVideoError, err:
1612                         self._downloader.trouble(u'ERROR: unable to download video')
1613
1614
1615 class YoutubeSearchIE(InfoExtractor):
1616         """Information Extractor for YouTube search queries."""
1617         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1618         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1619         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1620         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1621         _youtube_ie = None
1622         _max_youtube_results = 1000
1623
1624         def __init__(self, youtube_ie, downloader=None):
1625                 InfoExtractor.__init__(self, downloader)
1626                 self._youtube_ie = youtube_ie
1627         
1628         @staticmethod
1629         def suitable(url):
1630                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1631
1632         def report_download_page(self, query, pagenum):
1633                 """Report attempt to download playlist page with given number."""
1634                 query = query.decode(preferredencoding())
1635                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1636
1637         def _real_initialize(self):
1638                 self._youtube_ie.initialize()
1639         
1640         def _real_extract(self, query):
1641                 mobj = re.match(self._VALID_QUERY, query)
1642                 if mobj is None:
1643                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1644                         return
1645
1646                 prefix, query = query.split(':')
1647                 prefix = prefix[8:]
1648                 query  = query.encode('utf-8')
1649                 if prefix == '':
1650                         self._download_n_results(query, 1)
1651                         return
1652                 elif prefix == 'all':
1653                         self._download_n_results(query, self._max_youtube_results)
1654                         return
1655                 else:
1656                         try:
1657                                 n = long(prefix)
1658                                 if n <= 0:
1659                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1660                                         return
1661                                 elif n > self._max_youtube_results:
1662                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1663                                         n = self._max_youtube_results
1664                                 self._download_n_results(query, n)
1665                                 return
1666                         except ValueError: # parsing prefix as integer fails
1667                                 self._download_n_results(query, 1)
1668                                 return
1669
1670         def _download_n_results(self, query, n):
1671                 """Downloads a specified number of results for a query"""
1672
1673                 video_ids = []
1674                 already_seen = set()
1675                 pagenum = 1
1676
1677                 while True:
1678                         self.report_download_page(query, pagenum)
1679                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1680                         request = urllib2.Request(result_url, None, std_headers)
1681                         try:
1682                                 page = urllib2.urlopen(request).read()
1683                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1684                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1685                                 return
1686
1687                         # Extract video identifiers
1688                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1689                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1690                                 if video_id not in already_seen:
1691                                         video_ids.append(video_id)
1692                                         already_seen.add(video_id)
1693                                         if len(video_ids) == n:
1694                                                 # Specified n videos reached
1695                                                 for id in video_ids:
1696                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1697                                                 return
1698
1699                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1700                                 for id in video_ids:
1701                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1702                                 return
1703
1704                         pagenum = pagenum + 1
1705
1706 class GoogleSearchIE(InfoExtractor):
1707         """Information Extractor for Google Video search queries."""
1708         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1709         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1710         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1711         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1712         _google_ie = None
1713         _max_google_results = 1000
1714
1715         def __init__(self, google_ie, downloader=None):
1716                 InfoExtractor.__init__(self, downloader)
1717                 self._google_ie = google_ie
1718         
1719         @staticmethod
1720         def suitable(url):
1721                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1722
1723         def report_download_page(self, query, pagenum):
1724                 """Report attempt to download playlist page with given number."""
1725                 query = query.decode(preferredencoding())
1726                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1727
1728         def _real_initialize(self):
1729                 self._google_ie.initialize()
1730         
1731         def _real_extract(self, query):
1732                 mobj = re.match(self._VALID_QUERY, query)
1733                 if mobj is None:
1734                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1735                         return
1736
1737                 prefix, query = query.split(':')
1738                 prefix = prefix[8:]
1739                 query  = query.encode('utf-8')
1740                 if prefix == '':
1741                         self._download_n_results(query, 1)
1742                         return
1743                 elif prefix == 'all':
1744                         self._download_n_results(query, self._max_google_results)
1745                         return
1746                 else:
1747                         try:
1748                                 n = long(prefix)
1749                                 if n <= 0:
1750                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1751                                         return
1752                                 elif n > self._max_google_results:
1753                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1754                                         n = self._max_google_results
1755                                 self._download_n_results(query, n)
1756                                 return
1757                         except ValueError: # parsing prefix as integer fails
1758                                 self._download_n_results(query, 1)
1759                                 return
1760
1761         def _download_n_results(self, query, n):
1762                 """Downloads a specified number of results for a query"""
1763
1764                 video_ids = []
1765                 already_seen = set()
1766                 pagenum = 1
1767
1768                 while True:
1769                         self.report_download_page(query, pagenum)
1770                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1771                         request = urllib2.Request(result_url, None, std_headers)
1772                         try:
1773                                 page = urllib2.urlopen(request).read()
1774                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1776                                 return
1777
1778                         # Extract video identifiers
1779                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1780                                 video_id = mobj.group(1)
1781                                 if video_id not in already_seen:
1782                                         video_ids.append(video_id)
1783                                         already_seen.add(video_id)
1784                                         if len(video_ids) == n:
1785                                                 # Specified n videos reached
1786                                                 for id in video_ids:
1787                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1788                                                 return
1789
1790                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1791                                 for id in video_ids:
1792                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1793                                 return
1794
1795                         pagenum = pagenum + 1
1796
1797 class YahooSearchIE(InfoExtractor):
1798         """Information Extractor for Yahoo! Video search queries."""
1799         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1800         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1801         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1802         _MORE_PAGES_INDICATOR = r'\s*Next'
1803         _yahoo_ie = None
1804         _max_yahoo_results = 1000
1805
	def __init__(self, yahoo_ie, downloader=None):
		# Initialize the base extractor, then keep a reference to the
		# YahooIE instance that will perform the actual extraction.
		InfoExtractor.__init__(self, downloader)
		self._yahoo_ie = yahoo_ie
1809         
1810         @staticmethod
1811         def suitable(url):
1812                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1813
1814         def report_download_page(self, query, pagenum):
1815                 """Report attempt to download playlist page with given number."""
1816                 query = query.decode(preferredencoding())
1817                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1818
1819         def _real_initialize(self):
1820                 self._yahoo_ie.initialize()
1821         
1822         def _real_extract(self, query):
1823                 mobj = re.match(self._VALID_QUERY, query)
1824                 if mobj is None:
1825                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1826                         return
1827
1828                 prefix, query = query.split(':')
1829                 prefix = prefix[8:]
1830                 query  = query.encode('utf-8')
1831                 if prefix == '':
1832                         self._download_n_results(query, 1)
1833                         return
1834                 elif prefix == 'all':
1835                         self._download_n_results(query, self._max_yahoo_results)
1836                         return
1837                 else:
1838                         try:
1839                                 n = long(prefix)
1840                                 if n <= 0:
1841                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1842                                         return
1843                                 elif n > self._max_yahoo_results:
1844                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1845                                         n = self._max_yahoo_results
1846                                 self._download_n_results(query, n)
1847                                 return
1848                         except ValueError: # parsing prefix as integer fails
1849                                 self._download_n_results(query, 1)
1850                                 return
1851
1852         def _download_n_results(self, query, n):
1853                 """Downloads a specified number of results for a query"""
1854
1855                 video_ids = []
1856                 already_seen = set()
1857                 pagenum = 1
1858
1859                 while True:
1860                         self.report_download_page(query, pagenum)
1861                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1862                         request = urllib2.Request(result_url, None, std_headers)
1863                         try:
1864                                 page = urllib2.urlopen(request).read()
1865                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1866                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1867                                 return
1868
1869                         # Extract video identifiers
1870                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1871                                 video_id = mobj.group(1)
1872                                 if video_id not in already_seen:
1873                                         video_ids.append(video_id)
1874                                         already_seen.add(video_id)
1875                                         if len(video_ids) == n:
1876                                                 # Specified n videos reached
1877                                                 for id in video_ids:
1878                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1879                                                 return
1880
1881                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1882                                 for id in video_ids:
1883                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1884                                 return
1885
1886                         pagenum = pagenum + 1
1887
1888 class YoutubePlaylistIE(InfoExtractor):
1889         """Information Extractor for YouTube playlists."""
1890
1891         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1892         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1893         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1894         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1895         _youtube_ie = None
1896
1897         def __init__(self, youtube_ie, downloader=None):
1898                 InfoExtractor.__init__(self, downloader)
1899                 self._youtube_ie = youtube_ie
1900         
1901         @staticmethod
1902         def suitable(url):
1903                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1904
1905         def report_download_page(self, playlist_id, pagenum):
1906                 """Report attempt to download playlist page with given number."""
1907                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1908
1909         def _real_initialize(self):
1910                 self._youtube_ie.initialize()
1911         
1912         def _real_extract(self, url):
1913                 # Extract playlist id
1914                 mobj = re.match(self._VALID_URL, url)
1915                 if mobj is None:
1916                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1917                         return
1918
1919                 # Download playlist pages
1920                 playlist_id = mobj.group(1)
1921                 video_ids = []
1922                 pagenum = 1
1923
1924                 while True:
1925                         self.report_download_page(playlist_id, pagenum)
1926                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1927                         try:
1928                                 page = urllib2.urlopen(request).read()
1929                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1930                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1931                                 return
1932
1933                         # Extract video identifiers
1934                         ids_in_page = []
1935                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936                                 if mobj.group(1) not in ids_in_page:
1937                                         ids_in_page.append(mobj.group(1))
1938                         video_ids.extend(ids_in_page)
1939
1940                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1941                                 break
1942                         pagenum = pagenum + 1
1943
1944                 playliststart = self._downloader.params.get('playliststart', 1)
1945                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1946                 if playliststart > 0:
1947                         video_ids = video_ids[playliststart:]
1948                         
1949                 for id in video_ids:
1950                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1951                 return
1952
1953 class YoutubeUserIE(InfoExtractor):
1954         """Information Extractor for YouTube users."""
1955
1956         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1957         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1958         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1959         _youtube_ie = None
1960
1961         def __init__(self, youtube_ie, downloader=None):
1962                 InfoExtractor.__init__(self, downloader)
1963                 self._youtube_ie = youtube_ie
1964         
1965         @staticmethod
1966         def suitable(url):
1967                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1968
1969         def report_download_page(self, username):
1970                 """Report attempt to download user page."""
1971                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1972
1973         def _real_initialize(self):
1974                 self._youtube_ie.initialize()
1975         
1976         def _real_extract(self, url):
1977                 # Extract username
1978                 mobj = re.match(self._VALID_URL, url)
1979                 if mobj is None:
1980                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1981                         return
1982
1983                 # Download user page
1984                 username = mobj.group(1)
1985                 video_ids = []
1986                 pagenum = 1
1987
1988                 self.report_download_page(username)
1989                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1990                 try:
1991                         page = urllib2.urlopen(request).read()
1992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1994                         return
1995
1996                 # Extract video identifiers
1997                 ids_in_page = []
1998
1999                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2000                         if mobj.group(1) not in ids_in_page:
2001                                 ids_in_page.append(mobj.group(1))
2002                 video_ids.extend(ids_in_page)
2003
2004                 playliststart = self._downloader.params.get('playliststart', 1)
2005                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2006                 if playliststart > 0:
2007                         video_ids = video_ids[playliststart:]   
2008
2009                 for id in video_ids:
2010                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2011                 return
2012
class PostProcessor(object):
	"""Base class for post-download processing steps.

	Instances are registered on a downloader via its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one; the value returned by one processor is fed to the next.

	Returning None from run() stops the chain; reaching the end of the
	chain also stops it.

	Like InfoExtractor objects, PostProcessors participate in a
	"mutual registration" scheme with their downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		"information" is an InfoExtractor-style dictionary augmented
		with a "filepath" key naming the downloaded file.

		Return None to halt the postprocessing chain, or a (possibly
		modified) information dictionary to pass along to the next
		processor. A PostProcessingError raised here is handled by the
		owning downloader.
		"""
		# Default behavior: pass the information through untouched.
		return information
2058         
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# Refuses to proceed unless the program file itself is writable.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_stdout('Updating to latest stable version...')
			latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# NOTE(review): file opened in text mode 'w'; on Windows 'wb'
			# would avoid newline translation of the downloaded script —
			# TODO confirm. Also no integrity check is performed on the
			# downloaded content before overwriting.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_stdout('Updated to version %s' % latest_version)

		# General configuration
		# NOTE(review): the second install_opener replaces the first as the
		# global opener; build_opener adds a default ProxyHandler anyway,
		# so proxy support is preserved — verify if handlers are changed.
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.07.24',
			conflict_handler='resolve',
		)

		# conflict_handler='resolve' above lets -h/-v be redefined below.
		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		video_format.add_option('-b', '--best-quality',
				action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Batch file verification
		# Read additional URLs from a file ('-' means stdin), skipping
		# blank lines; they are processed before the positional args.
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				batchurls = [x for x in batchurls if len(x) > 0]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
			parser.error(u'using output template conflicts with using title or literal title')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		if opts.username is not None and opts.password is None:
			# Prompt interactively rather than requiring the password on
			# the command line (where it would leak into process lists).
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		if opts.playliststart is not None:
			try:
				opts.playliststart = long(opts.playliststart)
			except (TypeError, ValueError), err:
				parser.error(u'invalid playlist page specified')

		# Information extractors
		# The search/playlist/user extractors wrap the base site
		# extractors and delegate individual video URLs to them.
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		# Any "get-*" option implies both quiet and simulate mode.
		# The outtmpl expression picks the first truthy template in
		# priority order, falling back to '%(id)s.%(ext)s'.
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			})
		# Registration order matters: more specific extractors first.
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		# With no URLs: error out unless -U was the sole purpose.
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)
		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')