727fadea47dcd9dff1743e7c3ad613453dcc9452
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
        from urlparse import parse_qs
except ImportError:
        from cgi import parse_qs

# HTTP headers sent with every request, imitating a contemporary Firefox
# browser so that sites serve the same content they would to a real user.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode set of characters considered safe when building simplified titles
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
        """Get preferred encoding.

        Returns the best encoding scheme for the system, based on
        locale.getpreferredencoding() and some further tweaks. Falls back
        to UTF-8 when the locale reports an encoding that the codecs
        registry does not actually know how to use.
        """
        try:
                pref = locale.getpreferredencoding()
                # Sanity-check the reported codec: encoding a test string
                # raises LookupError for unknown/bogus encoding names.
                u'TEST'.encode(pref)
        except Exception:
                pref = 'UTF-8'
        return pref
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
        """Try to open the given filename, and slightly tweak it if this fails.

        Attempts to open the given filename. If this fails, it tries to change
        the filename slightly, step by step, until it's either able to open it
        or it fails and raises a final exception, like the standard open()
        function.

        It returns the tuple (stream, definitive_file_name).
        """
        try:
                # '-' conventionally means "write to standard output"
                if filename == u'-':
                        return (sys.stdout, filename)
                stream = open(filename, open_mode)
                return (stream, filename)
        except (IOError, OSError), err:
                # In case of error, try to remove win32 forbidden chars
                filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

                # An exception here should be caught in the caller
                stream = open(filename, open_mode)
                return (stream, filename)
107
108
class DownloadError(Exception):
        """Download Error exception.

        Raised by FileDownloader objects when they are not configured to
        continue on errors. Carries the appropriate error message.
        """
        pass
117
class SameFileError(Exception):
        """Same File exception.

        Raised by FileDownloader objects when they detect that multiple
        files would have to be downloaded to the same file on disk.
        """
        pass
125
class PostProcessingError(Exception):
        """Post Processing exception.

        Raised by a PostProcessor's .run() method to signal an error in
        the postprocessing task.
        """
        pass
133
class UnavailableVideoError(Exception):
        """Unavailable Format exception.

        Raised when a video is requested in a format that is not available
        for that video.
        """
        pass
141
class ContentTooShortError(Exception):
        """Content Too Short exception.

        Raised by FileDownloader objects when a downloaded file turns out
        to be smaller than the size the server announced, which indicates
        the connection was probably interrupted mid-transfer.
        """
        # Byte counts: what was actually received vs. what was announced
        downloaded = None
        expected = None

        def __init__(self, downloaded, expected):
                self.expected = expected
                self.downloaded = downloaded
156
class FileDownloader(object):
        """File Downloader class.

        File downloader objects are the ones responsible of downloading the
        actual video file and writing it to disk if the user has requested
        it, among some other tasks. In most cases there should be one per
        program. As, given a video URL, the downloader doesn't know how to
        extract all the needed information, task that InfoExtractors do, it
        has to pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader handles it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor extracts
        all the information about the video or videos the URL refers to, and
        asks the FileDownloader to process the video information, possibly
        downloading the video.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the params
        attribute for the InfoExtractors to use. The FileDownloader also
        registers itself as the downloader in charge for the InfoExtractors
        that are added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        format_limit:   Highest quality format to try.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        nooverwrites:   Prevent overwriting files.
        retries:        Number of times to retry for HTTP error 503
        continuedl:     Try to continue downloads if possible.
        noprogress:     Do not print the progress bar.
        """

        # All of these are (re)initialized per instance in __init__; the
        # class-level values mainly document the attributes.
        params = None
        _ies = []
        _pps = []
        _download_retcode = None
        _num_downloads = None

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self._pps = []
                self._download_retcode = 0
                self._num_downloads = 0
                self.params = params

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                # Build the list of all intermediate paths: 'a/', 'a/b/', 'a/b/c/', ...
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                """Return a human-readable string for a byte count, e.g. '1.21M'."""
                if bytes is None:
                        return 'N/A'
                if type(bytes) is str:
                        bytes = float(bytes)
                if bytes == 0.0:
                        exponent = 0
                else:
                        # Power of 1024 selecting the suffix below
                        exponent = long(math.log(bytes, 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                """Return the download percentage as a fixed-width (6-char) string."""
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                """Estimate the remaining download time as an 'MM:SS' string."""
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                """Return the average download speed as a fixed-width string."""
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
                """Pick the next read size based on how fast the last block arrived."""
                # Keep the block size within a factor of two of the previous one
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return long(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return long(new_max)
                if rate < new_min:
                        return long(new_min)
                return long(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                # str.index('') is 0, so a missing suffix yields a multiplier of 1
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def add_post_processor(self, pp):
                """Add a PostProcessor object to the end of the chain."""
                self._pps.append(pp)
                pp.set_downloader(self)

        def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
                """Print message to stdout if not in quiet mode."""
                try:
                        if not self.params.get('quiet', False):
                                # The trailing comma suppresses print's own newline; the
                                # selected suffix appends one only when skip_eol is False.
                                print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
                        sys.stdout.flush()
                except (UnicodeEncodeError), err:
                        if not ignore_encoding_errors:
                                raise

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message.encode(preferredencoding())

        def fixed_template(self):
                """Checks if the output template is fixed."""
                # "Fixed" means it contains no '%(field)s' placeholders at all
                return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on if the downloader has been configured to ignore
                download errors or not, this method may throw an exception or
                not when errors are found, after printing the message.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self.params.get('ignoreerrors', False):
                        raise DownloadError(message)
                # When errors are ignored, remember the failure in the retcode
                self._download_retcode = 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self.params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
                        # Sleep just long enough for the average speed to drop
                        # back to the configured limit
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                if self.params.get('noprogress', False):
                        return
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_resuming_byte(self, resume_len):
                """Report attempt to resume at given byte."""
                self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

        def report_retry(self, count, retries):
                """Report retry in case of HTTP error 503"""
                self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

        def report_file_already_downloaded(self, file_name):
                """Report file has already been fully downloaded."""
                try:
                        self.to_stdout(u'[download] %s has already been downloaded' % file_name)
                except (UnicodeEncodeError), err:
                        # The file name is not printable in the current encoding
                        self.to_stdout(u'[download] The file has already been downloaded')

        def report_unable_to_resume(self):
                """Report it was impossible to resume download."""
                self.to_stdout(u'[download] Unable to resume')

        def report_finish(self):
                """Report download finished."""
                if self.params.get('noprogress', False):
                        self.to_stdout(u'[download] Download completed')
                else:
                        # Just terminate the in-place progress line
                        self.to_stdout(u'')

        def increment_downloads(self):
                """Increment the ordinal that assigns a number to each file."""
                self._num_downloads += 1

        def process_info(self, info_dict):
                """Process a single dictionary returned by an InfoExtractor."""
                # Do nothing else if in simulate mode
                if self.params.get('simulate', False):
                        # Forced printings
                        if self.params.get('forcetitle', False):
                                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forceurl', False):
                                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
                        if self.params.get('forcedescription', False) and 'description' in info_dict:
                                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

                        return

                try:
                        # Besides the extractor fields, the output template may use
                        # 'epoch' (current UNIX time) and 'ord' (download ordinal)
                        template_dict = dict(info_dict)
                        template_dict['epoch'] = unicode(long(time.time()))
                        template_dict['ord'] = unicode('%05d' % self._num_downloads)
                        filename = self.params['outtmpl'] % template_dict
                except (ValueError, KeyError), err:
                        self.trouble(u'ERROR: invalid system charset or erroneous output template')
                        return
                if self.params.get('nooverwrites', False) and os.path.exists(filename):
                        self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
                        return

                try:
                        self.pmkdir(filename)
                except (OSError, IOError), err:
                        self.trouble('ERROR: unable to create directories: %s' % str(err))
                        return

                try:
                        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
                except (OSError, IOError), err:
                        raise UnavailableVideoError
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.trouble('ERROR: unable to download video data: %s' % str(err))
                        return
                except (ContentTooShortError, ), err:
                        self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                        return

                if success:
                        try:
                                self.post_process(filename, info_dict)
                        except (PostProcessingError), err:
                                self.trouble('ERROR: postprocessing: %s' % str(err))
                                return

        def download(self, url_list):
                """Download a given list of URLs."""
                if len(url_list) > 1 and self.fixed_template():
                        # A fixed template would make every URL overwrite the same file
                        raise SameFileError(self.params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                # Go to next InfoExtractor if not suitable
                                if not ie.suitable(url):
                                        continue

                                # Suitable InfoExtractor found
                                suitable_found = True

                                # Extract information from URL and process it
                                ie.extract(url)

                                # Suitable InfoExtractor had been found; go to next URL
                                break

                        if not suitable_found:
                                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return self._download_retcode

        def post_process(self, filename, ie_info):
                """Run the postprocessing chain on the given file."""
                info = dict(ie_info)
                info['filepath'] = filename
                for pp in self._pps:
                        info = pp.run(info)
                        if info is None:
                                # A postprocessor may stop the chain by returning None
                                break

        def _download_with_rtmpdump(self, filename, url, player_url):
                """Download an rtmp:// URL by driving the external rtmpdump tool."""
                self.report_destination(filename)

                # Check for rtmpdump first
                try:
                        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
                except (OSError, IOError):
                        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
                        return False

                # Download using rtmpdump. rtmpdump returns exit code 2 when
                # the connection was interrumpted and resuming appears to be
                # possible. This is part of rtmpdump's normal usage, AFAIK.
                basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
                retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
                while retval == 2 or retval == 1:
                        prevsize = os.path.getsize(filename)
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
                        time.sleep(5.0) # This seems to be needed
                        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
                        cursize = os.path.getsize(filename)
                        if prevsize == cursize and retval == 1:
                                # No progress between retries: give up
                                break
                if retval == 0:
                        self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
                        return True
                else:
                        self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
                        return False

        def _do_download(self, filename, url, player_url):
                """Download url to filename over HTTP, resuming and retrying as
                configured in self.params. Returns True on success."""
                # Attempt to download using rtmpdump
                if url.startswith('rtmp'):
                        return self._download_with_rtmpdump(filename, url, player_url)

                stream = None
                open_mode = 'wb'
                # basic_request never carries a Range header; it is used to probe
                # the full length when a resume attempt fails with HTTP 416
                basic_request = urllib2.Request(url, None, std_headers)
                request = urllib2.Request(url, None, std_headers)

                # Establish possible resume length
                if os.path.isfile(filename):
                        resume_len = os.path.getsize(filename)
                else:
                        resume_len = 0

                # Request parameters in case of being able to resume
                if self.params.get('continuedl', False) and resume_len != 0:
                        self.report_resuming_byte(resume_len)
                        request.add_header('Range','bytes=%d-' % resume_len)
                        open_mode = 'ab'

                count = 0
                retries = self.params.get('retries', 0)
                while count <= retries:
                        # Establish connection
                        try:
                                data = urllib2.urlopen(request)
                                break
                        except (urllib2.HTTPError, ), err:
                                if err.code != 503 and err.code != 416:
                                        # Unexpected HTTP error
                                        raise
                                elif err.code == 416:
                                        # Unable to resume (requested range not satisfiable)
                                        try:
                                                # Open the connection again without the range header
                                                data = urllib2.urlopen(basic_request)
                                                content_length = data.info()['Content-Length']
                                        except (urllib2.HTTPError, ), err:
                                                if err.code != 503:
                                                        raise
                                        else:
                                                # Examine the reported length
                                                if (content_length is not None and
                                                    (resume_len - 100 < long(content_length) < resume_len + 100)):
                                                        # The file had already been fully downloaded.
                                                        # Explanation to the above condition: in issue #175 it was revealed that
                                                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                                                        # changing the file size slightly and causing problems for some users. So
                                                        # I decided to implement a suggested change and consider the file
                                                        # completely downloaded if the file size differs less than 100 bytes from
                                                        # the one in the hard drive.
                                                        self.report_file_already_downloaded(filename)
                                                        return True
                                                else:
                                                        # The length does not match, we start the download over
                                                        self.report_unable_to_resume()
                                                        open_mode = 'wb'
                                                        break
                        # Retry
                        count += 1
                        if count <= retries:
                                self.report_retry(count, retries)

                if count > retries:
                        self.trouble(u'ERROR: giving up after %s retries' % retries)
                        return False

                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
                start = time.time()
                while True:
                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len

                        # Open file just in time
                        if stream is None:
                                try:
                                        (stream, filename) = sanitize_open(filename, open_mode)
                                        self.report_destination(filename)
                                except (OSError, IOError), err:
                                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                        return False
                        try:
                                stream.write(data_block)
                        except (IOError, OSError), err:
                                self.trouble('\nERROR: unable to write data: %s' % str(err))
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                # Content-length is a string; compare against the stringified counter
                if data_len is not None and str(byte_counter) != data_len:
                        raise ContentTooShortError(byte_counter, long(data_len))
                return True
613
class InfoExtractor(object):
        """Information Extractor class.

        An information extractor takes a URL and pulls out everything the
        FileDownloader needs to know about the video (or videos) it points
        at: the real media URL, the title, a simplified title, the uploader
        and so on. The result is returned as a dictionary, which the
        FileDownloader then processes, possibly downloading the video to
        the file system. Each dictionary must carry these fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.
        format:         Video format.
        player_url:     SWF Player URL (may be None).

        The following fields are optional; their primary purpose is to let
        youtube-dl act as the backend of a video search function, such as
        the one in youtube2mp3. They are only used when their respective
        forced printing functions are called:

        thumbnail:      Full URL to a video thumbnail image.
        description:    One-line video description.

        Subclasses should redefine the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method,
        and will usually be instantiated and registered with the main
        downloader.
        """

        _ready = False          # True once _real_initialize() has run
        _downloader = None      # FileDownloader in charge of this extractor

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                if self._ready:
                        return
                self._real_initialize()
                self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass
684
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches the optional scheme/host prefix (youtu.be short links,
	# /v/ embeds, watch pages); group 2 captures the video ID. The (?(1).+)?
	# conditional lets a bare video ID match on its own.
	_VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
	# Format code -> filename extension; codes absent here fall back to
	# 'flv' in _real_extract().
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
		'38': 'video', # You actually don't know if this will be MOV, AVI or whatever
		'43': 'webm',
		'45': 'webm',
	}

	@staticmethod
	def suitable(url):
		# True when the URL matches _VALID_URL (see its group layout above).
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set the site language, optionally log in, and confirm age.

		Credentials come from the downloader params or, if 'usenetrc' is
		set, from the user's .netrc. Failures before age confirmation are
		reported as warnings (initialization aborts quietly); an
		age-confirmation failure is reported via trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					# authenticators() returned None: treat a missing entry
					# the same way as a malformed .netrc.
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the response still contains the login form, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Resolve the video URL(s) for `url` and feed each to process_info().

		Fetches the watch page (for the SWF player URL), then queries
		get_video_info for a download token and format map, and finally
		builds one get_video URL per selected format.
		"""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL
		mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			player_url = mobj.group(1)
		else:
			player_url = None

		# Get video info
		# Try successive 'el' parameter values; stop at the first response
		# that carries a 'token'. On a network error the whole extraction
		# aborts, so video_info is always bound after the loop.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					   % (video_id, el_type))
			request = urllib2.Request(video_info_url, None, std_headers)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			# Prefer YouTube's own explanation when one is provided.
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title
		# Collapse every run of non-alphanumeric characters to '_' and trim.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# description
		# Only scraped when --get-description was requested; otherwise the
		# placeholder below is used.
		video_description = 'No description available.'
		if self._downloader.params.get('forcedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1)

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		requested_format = self._downloader.params.get('format', None)
		get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

		if 'fmt_url_map' in video_info:
			# fmt_url_map is a comma-separated list of 'format|url' pairs.
			url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				# Cap quality: only consider formats at or below the limit.
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if requested_format is None:
				video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
			elif requested_format == '-1':
				video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
			else:
				video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

		elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			# No HTTP format map: fall back to the raw RTMP connection URL.
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]

		else:
			self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Find the video URL in fmt_url_map or conn paramters
			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description.decode('utf-8'),
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
948
949
950 class MetacafeIE(InfoExtractor):
951         """Information Extractor for metacafe.com."""
952
953         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
954         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
955         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
956         _youtube_ie = None
957
958         def __init__(self, youtube_ie, downloader=None):
959                 InfoExtractor.__init__(self, downloader)
960                 self._youtube_ie = youtube_ie
961
962         @staticmethod
963         def suitable(url):
964                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
965
966         def report_disclaimer(self):
967                 """Report disclaimer retrieval."""
968                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
969
970         def report_age_confirmation(self):
971                 """Report attempt to confirm age."""
972                 self._downloader.to_stdout(u'[metacafe] Confirming age')
973         
974         def report_download_webpage(self, video_id):
975                 """Report webpage download."""
976                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
977         
978         def report_extraction(self, video_id):
979                 """Report information extraction."""
980                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
981
982         def _real_initialize(self):
983                 # Retrieve disclaimer
984                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
985                 try:
986                         self.report_disclaimer()
987                         disclaimer = urllib2.urlopen(request).read()
988                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
989                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
990                         return
991
992                 # Confirm age
993                 disclaimer_form = {
994                         'filters': '0',
995                         'submit': "Continue - I'm over 18",
996                         }
997                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
998                 try:
999                         self.report_age_confirmation()
1000                         disclaimer = urllib2.urlopen(request).read()
1001                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1002                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1003                         return
1004         
1005         def _real_extract(self, url):
1006                 # Extract id and simplified title from URL
1007                 mobj = re.match(self._VALID_URL, url)
1008                 if mobj is None:
1009                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1010                         return
1011
1012                 video_id = mobj.group(1)
1013
1014                 # Check if video comes from YouTube
1015                 mobj2 = re.match(r'^yt-(.*)$', video_id)
1016                 if mobj2 is not None:
1017                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1018                         return
1019
1020                 # At this point we have a new video
1021                 self._downloader.increment_downloads()
1022
1023                 simple_title = mobj.group(2).decode('utf-8')
1024                 video_extension = 'flv'
1025
1026                 # Retrieve video webpage to extract further information
1027                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1028                 try:
1029                         self.report_download_webpage(video_id)
1030                         webpage = urllib2.urlopen(request).read()
1031                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1032                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1033                         return
1034
1035                 # Extract URL, uploader and title from webpage
1036                 self.report_extraction(video_id)
1037                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1038                 if mobj is None:
1039                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1040                         return
1041                 mediaURL = urllib.unquote(mobj.group(1))
1042
1043                 # Extract gdaKey if available
1044                 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1045                 if mobj is None:
1046                         video_url = mediaURL
1047                         #self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1048                         #return
1049                 else:
1050                         gdaKey = mobj.group(1)
1051                         video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1052
1053                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1054                 if mobj is None:
1055                         self._downloader.trouble(u'ERROR: unable to extract title')
1056                         return
1057                 video_title = mobj.group(1).decode('utf-8')
1058                 video_title = sanitize_title(video_title)
1059
1060                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1061                 if mobj is None:
1062                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1063                         return
1064                 video_uploader = mobj.group(1)
1065
1066                 try:
1067                         # Process video information
1068                         self._downloader.process_info({
1069                                 'id':           video_id.decode('utf-8'),
1070                                 'url':          video_url.decode('utf-8'),
1071                                 'uploader':     video_uploader.decode('utf-8'),
1072                                 'title':        video_title,
1073                                 'stitle':       simple_title,
1074                                 'ext':          video_extension.decode('utf-8'),
1075                                 'format':       u'NA',
1076                                 'player_url':   None,
1077                         })
1078                 except UnavailableVideoError:
1079                         self._downloader.trouble(u'ERROR: unable to download video')
1080
1081
1082 class DailymotionIE(InfoExtractor):
1083         """Information Extractor for Dailymotion"""
1084
1085         _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1086
1087         def __init__(self, downloader=None):
1088                 InfoExtractor.__init__(self, downloader)
1089
1090         @staticmethod
1091         def suitable(url):
1092                 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1093
1094         def report_download_webpage(self, video_id):
1095                 """Report webpage download."""
1096                 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1097         
1098         def report_extraction(self, video_id):
1099                 """Report information extraction."""
1100                 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1101
1102         def _real_initialize(self):
1103                 return
1104
1105         def _real_extract(self, url):
1106                 # Extract id and simplified title from URL
1107                 mobj = re.match(self._VALID_URL, url)
1108                 if mobj is None:
1109                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1110                         return
1111
1112                 # At this point we have a new video
1113                 self._downloader.increment_downloads()
1114                 video_id = mobj.group(1)
1115
1116                 simple_title = mobj.group(2).decode('utf-8')
1117                 video_extension = 'flv'
1118
1119                 # Retrieve video webpage to extract further information
1120                 request = urllib2.Request(url)
1121                 try:
1122                         self.report_download_webpage(video_id)
1123                         webpage = urllib2.urlopen(request).read()
1124                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1125                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1126                         return
1127
1128                 # Extract URL, uploader and title from webpage
1129                 self.report_extraction(video_id)
1130                 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1131                 if mobj is None:
1132                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1133                         return
1134                 mediaURL = urllib.unquote(mobj.group(1))
1135
1136                 # if needed add http://www.dailymotion.com/ if relative URL
1137
1138                 video_url = mediaURL
1139
1140                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1141                 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1142                 if mobj is None:
1143                         self._downloader.trouble(u'ERROR: unable to extract title')
1144                         return
1145                 video_title = mobj.group(1).decode('utf-8')
1146                 video_title = sanitize_title(video_title)
1147
1148                 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1149                 if mobj is None:
1150                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1151                         return
1152                 video_uploader = mobj.group(1)
1153
1154                 try:
1155                         # Process video information
1156                         self._downloader.process_info({
1157                                 'id':           video_id.decode('utf-8'),
1158                                 'url':          video_url.decode('utf-8'),
1159                                 'uploader':     video_uploader.decode('utf-8'),
1160                                 'title':        video_title,
1161                                 'stitle':       simple_title,
1162                                 'ext':          video_extension.decode('utf-8'),
1163                                 'format':       u'NA',
1164                                 'player_url':   None,
1165                         })
1166                 except UnavailableVideoError:
1167                         self._downloader.trouble(u'ERROR: unable to download video')
1168
1169 class GoogleIE(InfoExtractor):
1170         """Information extractor for video.google.com."""
1171
1172         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1173
1174         def __init__(self, downloader=None):
1175                 InfoExtractor.__init__(self, downloader)
1176
1177         @staticmethod
1178         def suitable(url):
1179                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1180
1181         def report_download_webpage(self, video_id):
1182                 """Report webpage download."""
1183                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1184
1185         def report_extraction(self, video_id):
1186                 """Report information extraction."""
1187                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1188
1189         def _real_initialize(self):
1190                 return
1191
1192         def _real_extract(self, url):
1193                 # Extract id from URL
1194                 mobj = re.match(self._VALID_URL, url)
1195                 if mobj is None:
1196                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1197                         return
1198
1199                 # At this point we have a new video
1200                 self._downloader.increment_downloads()
1201                 video_id = mobj.group(1)
1202
1203                 video_extension = 'mp4'
1204
1205                 # Retrieve video webpage to extract further information
1206                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1207                 try:
1208                         self.report_download_webpage(video_id)
1209                         webpage = urllib2.urlopen(request).read()
1210                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1211                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1212                         return
1213
1214                 # Extract URL, uploader, and title from webpage
1215                 self.report_extraction(video_id)
1216                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1217                 if mobj is None:
1218                         video_extension = 'flv'
1219                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1220                 if mobj is None:
1221                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1222                         return
1223                 mediaURL = urllib.unquote(mobj.group(1))
1224                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1225                 mediaURL = mediaURL.replace('\\x26', '\x26')
1226
1227                 video_url = mediaURL
1228
1229                 mobj = re.search(r'<title>(.*)</title>', webpage)
1230                 if mobj is None:
1231                         self._downloader.trouble(u'ERROR: unable to extract title')
1232                         return
1233                 video_title = mobj.group(1).decode('utf-8')
1234                 video_title = sanitize_title(video_title)
1235                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1236
1237                 # Extract video description
1238                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1239                 if mobj is None:
1240                         self._downloader.trouble(u'ERROR: unable to extract video description')
1241                         return
1242                 video_description = mobj.group(1).decode('utf-8')
1243                 if not video_description:
1244                         video_description = 'No description available.'
1245
1246                 # Extract video thumbnail
1247                 if self._downloader.params.get('forcethumbnail', False):
1248                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1249                         try:
1250                                 webpage = urllib2.urlopen(request).read()
1251                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1252                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1253                                 return
1254                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1255                         if mobj is None:
1256                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1257                                 return
1258                         video_thumbnail = mobj.group(1)
1259                 else:   # we need something to pass to process_info
1260                         video_thumbnail = ''
1261
1262
1263                 try:
1264                         # Process video information
1265                         self._downloader.process_info({
1266                                 'id':           video_id.decode('utf-8'),
1267                                 'url':          video_url.decode('utf-8'),
1268                                 'uploader':     u'NA',
1269                                 'title':        video_title,
1270                                 'stitle':       simple_title,
1271                                 'ext':          video_extension.decode('utf-8'),
1272                                 'format':       u'NA',
1273                                 'player_url':   None,
1274                         })
1275                 except UnavailableVideoError:
1276                         self._downloader.trouble(u'ERROR: unable to download video')
1277
1278
1279 class PhotobucketIE(InfoExtractor):
1280         """Information extractor for photobucket.com."""
1281
1282         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1283
1284         def __init__(self, downloader=None):
1285                 InfoExtractor.__init__(self, downloader)
1286
1287         @staticmethod
1288         def suitable(url):
1289                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1290
1291         def report_download_webpage(self, video_id):
1292                 """Report webpage download."""
1293                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1294
1295         def report_extraction(self, video_id):
1296                 """Report information extraction."""
1297                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1298
1299         def _real_initialize(self):
1300                 return
1301
1302         def _real_extract(self, url):
1303                 # Extract id from URL
1304                 mobj = re.match(self._VALID_URL, url)
1305                 if mobj is None:
1306                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1307                         return
1308
1309                 # At this point we have a new video
1310                 self._downloader.increment_downloads()
1311                 video_id = mobj.group(1)
1312
1313                 video_extension = 'flv'
1314
1315                 # Retrieve video webpage to extract further information
1316                 request = urllib2.Request(url)
1317                 try:
1318                         self.report_download_webpage(video_id)
1319                         webpage = urllib2.urlopen(request).read()
1320                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1321                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1322                         return
1323
1324                 # Extract URL, uploader, and title from webpage
1325                 self.report_extraction(video_id)
1326                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1327                 if mobj is None:
1328                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1329                         return
1330                 mediaURL = urllib.unquote(mobj.group(1))
1331
1332                 video_url = mediaURL
1333
1334                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1335                 if mobj is None:
1336                         self._downloader.trouble(u'ERROR: unable to extract title')
1337                         return
1338                 video_title = mobj.group(1).decode('utf-8')
1339                 video_title = sanitize_title(video_title)
1340                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1341
1342                 video_uploader = mobj.group(2).decode('utf-8')
1343
1344                 try:
1345                         # Process video information
1346                         self._downloader.process_info({
1347                                 'id':           video_id.decode('utf-8'),
1348                                 'url':          video_url.decode('utf-8'),
1349                                 'uploader':     video_uploader,
1350                                 'title':        video_title,
1351                                 'stitle':       simple_title,
1352                                 'ext':          video_extension.decode('utf-8'),
1353                                 'format':       u'NA',
1354                                 'player_url':   None,
1355                         })
1356                 except UnavailableVideoError:
1357                         self._downloader.trouble(u'ERROR: unable to download video')
1358
1359
1360 class YahooIE(InfoExtractor):
1361         """Information extractor for video.yahoo.com."""
1362
1363         # _VALID_URL matches all Yahoo! Video URLs
1364         # _VPAGE_URL matches only the extractable '/watch/' URLs
1365         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1366         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1367
1368         def __init__(self, downloader=None):
1369                 InfoExtractor.__init__(self, downloader)
1370
1371         @staticmethod
1372         def suitable(url):
1373                 return (re.match(YahooIE._VALID_URL, url) is not None)
1374
1375         def report_download_webpage(self, video_id):
1376                 """Report webpage download."""
1377                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1378
1379         def report_extraction(self, video_id):
1380                 """Report information extraction."""
1381                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1382
1383         def _real_initialize(self):
1384                 return
1385
1386         def _real_extract(self, url, new_video=True):
1387                 # Extract ID from URL
1388                 mobj = re.match(self._VALID_URL, url)
1389                 if mobj is None:
1390                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1391                         return
1392
1393                 # At this point we have a new video
1394                 self._downloader.increment_downloads()
1395                 video_id = mobj.group(2)
1396                 video_extension = 'flv'
1397
1398                 # Rewrite valid but non-extractable URLs as
1399                 # extractable English language /watch/ URLs
1400                 if re.match(self._VPAGE_URL, url) is None:
1401                         request = urllib2.Request(url)
1402                         try:
1403                                 webpage = urllib2.urlopen(request).read()
1404                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1405                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1406                                 return
1407
1408                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1409                         if mobj is None:
1410                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1411                                 return
1412                         yahoo_id = mobj.group(1)
1413
1414                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1415                         if mobj is None:
1416                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1417                                 return
1418                         yahoo_vid = mobj.group(1)
1419
1420                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1421                         return self._real_extract(url, new_video=False)
1422
1423                 # Retrieve video webpage to extract further information
1424                 request = urllib2.Request(url)
1425                 try:
1426                         self.report_download_webpage(video_id)
1427                         webpage = urllib2.urlopen(request).read()
1428                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1429                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1430                         return
1431
1432                 # Extract uploader and title from webpage
1433                 self.report_extraction(video_id)
1434                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1435                 if mobj is None:
1436                         self._downloader.trouble(u'ERROR: unable to extract video title')
1437                         return
1438                 video_title = mobj.group(1).decode('utf-8')
1439                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1440
1441                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1442                 if mobj is None:
1443                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1444                         return
1445                 video_uploader = mobj.group(1).decode('utf-8')
1446
1447                 # Extract video thumbnail
1448                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1449                 if mobj is None:
1450                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1451                         return
1452                 video_thumbnail = mobj.group(1).decode('utf-8')
1453
1454                 # Extract video description
1455                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1456                 if mobj is None:
1457                         self._downloader.trouble(u'ERROR: unable to extract video description')
1458                         return
1459                 video_description = mobj.group(1).decode('utf-8')
1460                 if not video_description: video_description = 'No description available.'
1461
1462                 # Extract video height and width
1463                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1464                 if mobj is None:
1465                         self._downloader.trouble(u'ERROR: unable to extract video height')
1466                         return
1467                 yv_video_height = mobj.group(1)
1468
1469                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1470                 if mobj is None:
1471                         self._downloader.trouble(u'ERROR: unable to extract video width')
1472                         return
1473                 yv_video_width = mobj.group(1)
1474
1475                 # Retrieve video playlist to extract media URL
1476                 # I'm not completely sure what all these options are, but we
1477                 # seem to need most of them, otherwise the server sends a 401.
1478                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1479                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1480                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1481                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1482                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1483                 try:
1484                         self.report_download_webpage(video_id)
1485                         webpage = urllib2.urlopen(request).read()
1486                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1487                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1488                         return
1489
1490                 # Extract media URL from playlist XML
1491                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1492                 if mobj is None:
1493                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1494                         return
1495                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1496                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1497
1498                 try:
1499                         # Process video information
1500                         self._downloader.process_info({
1501                                 'id':           video_id.decode('utf-8'),
1502                                 'url':          video_url,
1503                                 'uploader':     video_uploader,
1504                                 'title':        video_title,
1505                                 'stitle':       simple_title,
1506                                 'ext':          video_extension.decode('utf-8'),
1507                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1508                                 'description':  video_description,
1509                                 'thumbnail':    video_thumbnail,
1510                                 'description':  video_description,
1511                                 'player_url':   None,
1512                         })
1513                 except UnavailableVideoError:
1514                         self._downloader.trouble(u'ERROR: unable to download video')
1515
1516
1517 class GenericIE(InfoExtractor):
1518         """Generic last-resort information extractor."""
1519
1520         def __init__(self, downloader=None):
1521                 InfoExtractor.__init__(self, downloader)
1522
1523         @staticmethod
1524         def suitable(url):
1525                 return True
1526
1527         def report_download_webpage(self, video_id):
1528                 """Report webpage download."""
1529                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1530                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1531
1532         def report_extraction(self, video_id):
1533                 """Report information extraction."""
1534                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1535
1536         def _real_initialize(self):
1537                 return
1538
1539         def _real_extract(self, url):
1540                 # At this point we have a new video
1541                 self._downloader.increment_downloads()
1542
1543                 video_id = url.split('/')[-1]
1544                 request = urllib2.Request(url)
1545                 try:
1546                         self.report_download_webpage(video_id)
1547                         webpage = urllib2.urlopen(request).read()
1548                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1549                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1550                         return
1551                 except ValueError, err:
1552                         # since this is the last-resort InfoExtractor, if
1553                         # this error is thrown, it'll be thrown here
1554                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1555                         return
1556
1557                 # Start with something easy: JW Player in SWFObject
1558                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1559                 if mobj is None:
1560                         # Broaden the search a little bit
1561                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1562                 if mobj is None:
1563                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1564                         return
1565
1566                 # It's possible that one of the regexes
1567                 # matched, but returned an empty group:
1568                 if mobj.group(1) is None:
1569                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1570                         return
1571
1572                 video_url = urllib.unquote(mobj.group(1))
1573                 video_id  = os.path.basename(video_url)
1574
1575                 # here's a fun little line of code for you:
1576                 video_extension = os.path.splitext(video_id)[1][1:]
1577                 video_id        = os.path.splitext(video_id)[0]
1578
1579                 # it's tempting to parse this further, but you would
1580                 # have to take into account all the variations like
1581                 #   Video Title - Site Name
1582                 #   Site Name | Video Title
1583                 #   Video Title - Tagline | Site Name
1584                 # and so on and so forth; it's just not practical
1585                 mobj = re.search(r'<title>(.*)</title>', webpage)
1586                 if mobj is None:
1587                         self._downloader.trouble(u'ERROR: unable to extract title')
1588                         return
1589                 video_title = mobj.group(1).decode('utf-8')
1590                 video_title = sanitize_title(video_title)
1591                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1592
1593                 # video uploader is domain name
1594                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1595                 if mobj is None:
1596                         self._downloader.trouble(u'ERROR: unable to extract title')
1597                         return
1598                 video_uploader = mobj.group(1).decode('utf-8')
1599
1600                 try:
1601                         # Process video information
1602                         self._downloader.process_info({
1603                                 'id':           video_id.decode('utf-8'),
1604                                 'url':          video_url.decode('utf-8'),
1605                                 'uploader':     video_uploader,
1606                                 'title':        video_title,
1607                                 'stitle':       simple_title,
1608                                 'ext':          video_extension.decode('utf-8'),
1609                                 'format':       u'NA',
1610                                 'player_url':   None,
1611                         })
1612                 except UnavailableVideoError, err:
1613                         self._downloader.trouble(u'ERROR: unable to download video')
1614
1615
1616 class YoutubeSearchIE(InfoExtractor):
1617         """Information Extractor for YouTube search queries."""
1618         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1619         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1620         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1621         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1622         _youtube_ie = None
1623         _max_youtube_results = 1000
1624
1625         def __init__(self, youtube_ie, downloader=None):
1626                 InfoExtractor.__init__(self, downloader)
1627                 self._youtube_ie = youtube_ie
1628         
1629         @staticmethod
1630         def suitable(url):
1631                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1632
1633         def report_download_page(self, query, pagenum):
1634                 """Report attempt to download playlist page with given number."""
1635                 query = query.decode(preferredencoding())
1636                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1637
1638         def _real_initialize(self):
1639                 self._youtube_ie.initialize()
1640         
1641         def _real_extract(self, query):
1642                 mobj = re.match(self._VALID_QUERY, query)
1643                 if mobj is None:
1644                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1645                         return
1646
1647                 prefix, query = query.split(':')
1648                 prefix = prefix[8:]
1649                 query  = query.encode('utf-8')
1650                 if prefix == '':
1651                         self._download_n_results(query, 1)
1652                         return
1653                 elif prefix == 'all':
1654                         self._download_n_results(query, self._max_youtube_results)
1655                         return
1656                 else:
1657                         try:
1658                                 n = long(prefix)
1659                                 if n <= 0:
1660                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1661                                         return
1662                                 elif n > self._max_youtube_results:
1663                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1664                                         n = self._max_youtube_results
1665                                 self._download_n_results(query, n)
1666                                 return
1667                         except ValueError: # parsing prefix as integer fails
1668                                 self._download_n_results(query, 1)
1669                                 return
1670
1671         def _download_n_results(self, query, n):
1672                 """Downloads a specified number of results for a query"""
1673
1674                 video_ids = []
1675                 already_seen = set()
1676                 pagenum = 1
1677
1678                 while True:
1679                         self.report_download_page(query, pagenum)
1680                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1681                         request = urllib2.Request(result_url, None, std_headers)
1682                         try:
1683                                 page = urllib2.urlopen(request).read()
1684                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1685                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1686                                 return
1687
1688                         # Extract video identifiers
1689                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1690                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1691                                 if video_id not in already_seen:
1692                                         video_ids.append(video_id)
1693                                         already_seen.add(video_id)
1694                                         if len(video_ids) == n:
1695                                                 # Specified n videos reached
1696                                                 for id in video_ids:
1697                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1698                                                 return
1699
1700                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1701                                 for id in video_ids:
1702                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1703                                 return
1704
1705                         pagenum = pagenum + 1
1706
1707 class GoogleSearchIE(InfoExtractor):
1708         """Information Extractor for Google Video search queries."""
1709         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1710         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1711         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1712         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1713         _google_ie = None
1714         _max_google_results = 1000
1715
1716         def __init__(self, google_ie, downloader=None):
1717                 InfoExtractor.__init__(self, downloader)
1718                 self._google_ie = google_ie
1719         
1720         @staticmethod
1721         def suitable(url):
1722                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1723
1724         def report_download_page(self, query, pagenum):
1725                 """Report attempt to download playlist page with given number."""
1726                 query = query.decode(preferredencoding())
1727                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1728
1729         def _real_initialize(self):
1730                 self._google_ie.initialize()
1731         
1732         def _real_extract(self, query):
1733                 mobj = re.match(self._VALID_QUERY, query)
1734                 if mobj is None:
1735                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1736                         return
1737
1738                 prefix, query = query.split(':')
1739                 prefix = prefix[8:]
1740                 query  = query.encode('utf-8')
1741                 if prefix == '':
1742                         self._download_n_results(query, 1)
1743                         return
1744                 elif prefix == 'all':
1745                         self._download_n_results(query, self._max_google_results)
1746                         return
1747                 else:
1748                         try:
1749                                 n = long(prefix)
1750                                 if n <= 0:
1751                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1752                                         return
1753                                 elif n > self._max_google_results:
1754                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1755                                         n = self._max_google_results
1756                                 self._download_n_results(query, n)
1757                                 return
1758                         except ValueError: # parsing prefix as integer fails
1759                                 self._download_n_results(query, 1)
1760                                 return
1761
1762         def _download_n_results(self, query, n):
1763                 """Downloads a specified number of results for a query"""
1764
1765                 video_ids = []
1766                 already_seen = set()
1767                 pagenum = 1
1768
1769                 while True:
1770                         self.report_download_page(query, pagenum)
1771                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1772                         request = urllib2.Request(result_url, None, std_headers)
1773                         try:
1774                                 page = urllib2.urlopen(request).read()
1775                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1777                                 return
1778
1779                         # Extract video identifiers
1780                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1781                                 video_id = mobj.group(1)
1782                                 if video_id not in already_seen:
1783                                         video_ids.append(video_id)
1784                                         already_seen.add(video_id)
1785                                         if len(video_ids) == n:
1786                                                 # Specified n videos reached
1787                                                 for id in video_ids:
1788                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1789                                                 return
1790
1791                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1792                                 for id in video_ids:
1793                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1794                                 return
1795
1796                         pagenum = pagenum + 1
1797
1798 class YahooSearchIE(InfoExtractor):
1799         """Information Extractor for Yahoo! Video search queries."""
1800         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1801         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1802         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1803         _MORE_PAGES_INDICATOR = r'\s*Next'
1804         _yahoo_ie = None
1805         _max_yahoo_results = 1000
1806
1807         def __init__(self, yahoo_ie, downloader=None):
1808                 InfoExtractor.__init__(self, downloader)
1809                 self._yahoo_ie = yahoo_ie
1810         
1811         @staticmethod
1812         def suitable(url):
1813                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1814
1815         def report_download_page(self, query, pagenum):
1816                 """Report attempt to download playlist page with given number."""
1817                 query = query.decode(preferredencoding())
1818                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1819
1820         def _real_initialize(self):
1821                 self._yahoo_ie.initialize()
1822         
1823         def _real_extract(self, query):
1824                 mobj = re.match(self._VALID_QUERY, query)
1825                 if mobj is None:
1826                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1827                         return
1828
1829                 prefix, query = query.split(':')
1830                 prefix = prefix[8:]
1831                 query  = query.encode('utf-8')
1832                 if prefix == '':
1833                         self._download_n_results(query, 1)
1834                         return
1835                 elif prefix == 'all':
1836                         self._download_n_results(query, self._max_yahoo_results)
1837                         return
1838                 else:
1839                         try:
1840                                 n = long(prefix)
1841                                 if n <= 0:
1842                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1843                                         return
1844                                 elif n > self._max_yahoo_results:
1845                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1846                                         n = self._max_yahoo_results
1847                                 self._download_n_results(query, n)
1848                                 return
1849                         except ValueError: # parsing prefix as integer fails
1850                                 self._download_n_results(query, 1)
1851                                 return
1852
1853         def _download_n_results(self, query, n):
1854                 """Downloads a specified number of results for a query"""
1855
1856                 video_ids = []
1857                 already_seen = set()
1858                 pagenum = 1
1859
1860                 while True:
1861                         self.report_download_page(query, pagenum)
1862                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1863                         request = urllib2.Request(result_url, None, std_headers)
1864                         try:
1865                                 page = urllib2.urlopen(request).read()
1866                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1867                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1868                                 return
1869
1870                         # Extract video identifiers
1871                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1872                                 video_id = mobj.group(1)
1873                                 if video_id not in already_seen:
1874                                         video_ids.append(video_id)
1875                                         already_seen.add(video_id)
1876                                         if len(video_ids) == n:
1877                                                 # Specified n videos reached
1878                                                 for id in video_ids:
1879                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1880                                                 return
1881
1882                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1883                                 for id in video_ids:
1884                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1885                                 return
1886
1887                         pagenum = pagenum + 1
1888
1889 class YoutubePlaylistIE(InfoExtractor):
1890         """Information Extractor for YouTube playlists."""
1891
1892         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1893         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1894         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1895         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1896         _youtube_ie = None
1897
1898         def __init__(self, youtube_ie, downloader=None):
1899                 InfoExtractor.__init__(self, downloader)
1900                 self._youtube_ie = youtube_ie
1901         
1902         @staticmethod
1903         def suitable(url):
1904                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1905
1906         def report_download_page(self, playlist_id, pagenum):
1907                 """Report attempt to download playlist page with given number."""
1908                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1909
1910         def _real_initialize(self):
1911                 self._youtube_ie.initialize()
1912         
1913         def _real_extract(self, url):
1914                 # Extract playlist id
1915                 mobj = re.match(self._VALID_URL, url)
1916                 if mobj is None:
1917                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1918                         return
1919
1920                 # Download playlist pages
1921                 playlist_id = mobj.group(1)
1922                 video_ids = []
1923                 pagenum = 1
1924
1925                 while True:
1926                         self.report_download_page(playlist_id, pagenum)
1927                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1928                         try:
1929                                 page = urllib2.urlopen(request).read()
1930                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1932                                 return
1933
1934                         # Extract video identifiers
1935                         ids_in_page = []
1936                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1937                                 if mobj.group(1) not in ids_in_page:
1938                                         ids_in_page.append(mobj.group(1))
1939                         video_ids.extend(ids_in_page)
1940
1941                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1942                                 break
1943                         pagenum = pagenum + 1
1944
1945                 playliststart = self._downloader.params.get('playliststart', 1)
1946                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1947                 if playliststart > 0:
1948                         video_ids = video_ids[playliststart:]
1949                         
1950                 for id in video_ids:
1951                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1952                 return
1953
1954 class YoutubeUserIE(InfoExtractor):
1955         """Information Extractor for YouTube users."""
1956
1957         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1958         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1959         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1960         _youtube_ie = None
1961
1962         def __init__(self, youtube_ie, downloader=None):
1963                 InfoExtractor.__init__(self, downloader)
1964                 self._youtube_ie = youtube_ie
1965         
1966         @staticmethod
1967         def suitable(url):
1968                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1969
1970         def report_download_page(self, username):
1971                 """Report attempt to download user page."""
1972                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1973
1974         def _real_initialize(self):
1975                 self._youtube_ie.initialize()
1976         
1977         def _real_extract(self, url):
1978                 # Extract username
1979                 mobj = re.match(self._VALID_URL, url)
1980                 if mobj is None:
1981                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1982                         return
1983
1984                 # Download user page
1985                 username = mobj.group(1)
1986                 video_ids = []
1987                 pagenum = 1
1988
1989                 self.report_download_page(username)
1990                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1991                 try:
1992                         page = urllib2.urlopen(request).read()
1993                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1994                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1995                         return
1996
1997                 # Extract video identifiers
1998                 ids_in_page = []
1999
2000                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2001                         if mobj.group(1) not in ids_in_page:
2002                                 ids_in_page.append(mobj.group(1))
2003                 video_ids.extend(ids_in_page)
2004
2005                 playliststart = self._downloader.params.get('playliststart', 1)
2006                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2007                 if playliststart > 0:
2008                         video_ids = video_ids[playliststart:]   
2009
2010                 for id in video_ids:
2011                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2012                 return
2013
class PostProcessor(object):
	"""Base class for all post processors.

	Instances are registered on a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of PostProcessors, handing run() an
	information dictionary and feeding each returned dictionary to the
	next processor in line.

	The chain stops as soon as a processor returns None, or when the
	last processor has run.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with the downloader that owns them.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach the downloader this post processor belongs to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		"information" is a dictionary shaped like the ones produced by
		InfoExtractors, extended with a "filepath" key that names the
		downloaded file.

		Returning None halts the postprocessing chain; returning a
		dictionary (possibly the received one, with some fields
		changed) passes it along to the next processor. A processor
		may also raise PostProcessingError, which the calling
		downloader will handle.

		The base implementation is a no-op pass-through.
		"""
		return information # by default, do nothing
2059         
2060 ### MAIN PROGRAM ###
2061 if __name__ == '__main__':
2062         try:
2063                 # Modules needed only when running the main program
2064                 import getpass
2065                 import optparse
2066
2067                 # Function to update the program file with the latest version from bitbucket.org
2068                 def update_self(downloader, filename):
2069                         # Note: downloader only used for options
2070                         if not os.access (filename, os.W_OK):
2071                                 sys.exit('ERROR: no write permissions on %s' % filename)
2072
2073                         downloader.to_stdout('Updating to latest stable version...')
2074                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
2075                         latest_version = urllib.urlopen(latest_url).read().strip()
2076                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2077                         newcontent = urllib.urlopen(prog_url).read()
2078                         stream = open(filename, 'w')
2079                         stream.write(newcontent)
2080                         stream.close()
2081                         downloader.to_stdout('Updated to version %s' % latest_version)
2082
2083                 # General configuration
2084                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2085                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
2086                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2087
2088                 # Parse command line
2089                 parser = optparse.OptionParser(
2090                         usage='Usage: %prog [options] url...',
2091                         version='2010.08.04',
2092                         conflict_handler='resolve',
2093                 )
2094
2095                 parser.add_option('-h', '--help',
2096                                 action='help', help='print this help text and exit')
2097                 parser.add_option('-v', '--version',
2098                                 action='version', help='print program version and exit')
2099                 parser.add_option('-U', '--update',
2100                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2101                 parser.add_option('-i', '--ignore-errors',
2102                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2103                 parser.add_option('-r', '--rate-limit',
2104                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2105                 parser.add_option('-R', '--retries',
2106                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2107                 parser.add_option('--playlist-start',
2108                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2109
2110                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2111                 authentication.add_option('-u', '--username',
2112                                 dest='username', metavar='USERNAME', help='account username')
2113                 authentication.add_option('-p', '--password',
2114                                 dest='password', metavar='PASSWORD', help='account password')
2115                 authentication.add_option('-n', '--netrc',
2116                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2117                 parser.add_option_group(authentication)
2118
2119                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2120                 video_format.add_option('-f', '--format',
2121                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2122                 video_format.add_option('-m', '--mobile-version',
2123                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2124                 video_format.add_option('--all-formats',
2125                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2126                 video_format.add_option('--max-quality',
2127                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2128                 video_format.add_option('-b', '--best-quality',
2129                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2130                 parser.add_option_group(video_format)
2131
2132                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2133                 verbosity.add_option('-q', '--quiet',
2134                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2135                 verbosity.add_option('-s', '--simulate',
2136                                 action='store_true', dest='simulate', help='do not download video', default=False)
2137                 verbosity.add_option('-g', '--get-url',
2138                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2139                 verbosity.add_option('-e', '--get-title',
2140                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2141                 verbosity.add_option('--get-thumbnail',
2142                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2143                 verbosity.add_option('--get-description',
2144                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2145                 verbosity.add_option('--no-progress',
2146                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2147                 parser.add_option_group(verbosity)
2148
2149                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2150                 filesystem.add_option('-t', '--title',
2151                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2152                 filesystem.add_option('-l', '--literal',
2153                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2154                 filesystem.add_option('-o', '--output',
2155                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2156                 filesystem.add_option('-a', '--batch-file',
2157                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2158                 filesystem.add_option('-w', '--no-overwrites',
2159                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2160                 filesystem.add_option('-c', '--continue',
2161                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2162                 parser.add_option_group(filesystem)
2163
2164                 (opts, args) = parser.parse_args()
2165
2166                 # Batch file verification
2167                 batchurls = []
2168                 if opts.batchfile is not None:
2169                         try:
2170                                 if opts.batchfile == '-':
2171                                         batchfd = sys.stdin
2172                                 else:
2173                                         batchfd = open(opts.batchfile, 'r')
2174                                 batchurls = batchfd.readlines()
2175                                 batchurls = [x.strip() for x in batchurls]
2176                                 batchurls = [x for x in batchurls if len(x) > 0]
2177                         except IOError:
2178                                 sys.exit(u'ERROR: batch file could not be read')
2179                 all_urls = batchurls + args
2180
2181                 # Conflicting, missing and erroneous options
2182                 if opts.bestquality:
2183                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2184                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2185                         parser.error(u'using .netrc conflicts with giving username/password')
2186                 if opts.password is not None and opts.username is None:
2187                         parser.error(u'account username missing')
2188                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2189                         parser.error(u'using output template conflicts with using title or literal title')
2190                 if opts.usetitle and opts.useliteral:
2191                         parser.error(u'using title conflicts with using literal title')
2192                 if opts.username is not None and opts.password is None:
2193                         opts.password = getpass.getpass(u'Type account password and press return:')
2194                 if opts.ratelimit is not None:
2195                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2196                         if numeric_limit is None:
2197                                 parser.error(u'invalid rate limit specified')
2198                         opts.ratelimit = numeric_limit
2199                 if opts.retries is not None:
2200                         try:
2201                                 opts.retries = long(opts.retries)
2202                         except (TypeError, ValueError), err:
2203                                 parser.error(u'invalid retry count specified')
2204                 if opts.playliststart is not None:
2205                         try:
2206                                 opts.playliststart = long(opts.playliststart)
2207                         except (TypeError, ValueError), err:
2208                                 parser.error(u'invalid playlist page specified')
2209
2210                 # Information extractors
2211                 youtube_ie = YoutubeIE()
2212                 metacafe_ie = MetacafeIE(youtube_ie)
2213                 dailymotion_ie = DailymotionIE()
2214                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2215                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2216                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2217                 google_ie = GoogleIE()
2218                 google_search_ie = GoogleSearchIE(google_ie)
2219                 photobucket_ie = PhotobucketIE()
2220                 yahoo_ie = YahooIE()
2221                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2222                 generic_ie = GenericIE()
2223
2224                 # File downloader
2225                 fd = FileDownloader({
2226                         'usenetrc': opts.usenetrc,
2227                         'username': opts.username,
2228                         'password': opts.password,
2229                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2230                         'forceurl': opts.geturl,
2231                         'forcetitle': opts.gettitle,
2232                         'forcethumbnail': opts.getthumbnail,
2233                         'forcedescription': opts.getdescription,
2234                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2235                         'format': opts.format,
2236                         'format_limit': opts.format_limit,
2237                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2238                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2239                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2240                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2241                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2242                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2243                                 or u'%(id)s.%(ext)s'),
2244                         'ignoreerrors': opts.ignoreerrors,
2245                         'ratelimit': opts.ratelimit,
2246                         'nooverwrites': opts.nooverwrites,
2247                         'retries': opts.retries,
2248                         'continuedl': opts.continue_dl,
2249                         'noprogress': opts.noprogress,
2250                         'playliststart': opts.playliststart,
2251                         })
2252                 fd.add_info_extractor(youtube_search_ie)
2253                 fd.add_info_extractor(youtube_pl_ie)
2254                 fd.add_info_extractor(youtube_user_ie)
2255                 fd.add_info_extractor(metacafe_ie)
2256                 fd.add_info_extractor(dailymotion_ie)
2257                 fd.add_info_extractor(youtube_ie)
2258                 fd.add_info_extractor(google_ie)
2259                 fd.add_info_extractor(google_search_ie)
2260                 fd.add_info_extractor(photobucket_ie)
2261                 fd.add_info_extractor(yahoo_ie)
2262                 fd.add_info_extractor(yahoo_search_ie)
2263
2264                 # This must come last since it's the
2265                 # fallback if none of the others work
2266                 fd.add_info_extractor(generic_ie)
2267
2268                 # Update version
2269                 if opts.update_self:
2270                         update_self(fd, sys.argv[0])
2271
2272                 # Maybe do nothing
2273                 if len(all_urls) < 1:
2274                         if not opts.update_self:
2275                                 parser.error(u'you must provide at least one URL')
2276                         else:
2277                                 sys.exit()
2278                 retcode = fd.download(all_urls)
2279                 sys.exit(retcode)
2280
2281         except DownloadError:
2282                 sys.exit(1)
2283         except SameFileError:
2284                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2285         except KeyboardInterrupt:
2286                 sys.exit(u'\nERROR: Interrupted by user')